Skip to content

Commit e8e72ee

Browse files
committed
feat(recon): Implement pre-generation active API discovery
- Added support for active probing before test case generation - Extracts dynamically discovered undocumented endpoints into the baseline SchemaStructure - Allows the AI to view these undocumented endpoints natively
1 parent 0d83b45 commit e8e72ee

6 files changed

Lines changed: 232 additions & 48 deletions

File tree

src/secnodeapi/ai/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44

55
from .generate import generate_test_cases
66
from .understand import understand_api_with_ai
7-
from .validate import classify_findings, validate_findings_with_ai
7+
from .validate import classify_findings, validate_findings_with_ai, deduplicate_findings_with_ai
88

99
__all__ = [
1010
"understand_api_with_ai",
1111
"generate_test_cases",
1212
"classify_findings",
1313
"validate_findings_with_ai",
14+
"deduplicate_findings_with_ai",
1415
]

src/secnodeapi/ai/llm_client.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,14 @@ async def call_llm(system_prompt: str, user_prompt: str, temperature: float = 0.
3939
completion_kwargs["api_base"] = api_base
4040
elif provider == "nebius":
4141
# Nebius is OpenAI-compatible and requires its specific base URL
42-
completion_kwargs["api_base"] = "https://api.tokenfactory.nebius.com/v1/"
42+
# LiteLLM requires 'openai/' prefix for compatible endpoints
43+
if "/" in model:
44+
_, model_name = model.split("/", 1)
45+
completion_kwargs["model"] = f"openai/{model_name}"
46+
47+
api_base = os.getenv("NEBIUS_API_BASE", "https://api.tokenfactory.nebius.com/v1/").strip()
48+
if api_base:
49+
completion_kwargs["api_base"] = api_base
4350
completion_kwargs["api_key"] = os.getenv("NEBIUS_API_KEY", "")
4451

4552
response = await acompletion(

src/secnodeapi/ai/validate.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,3 +208,77 @@ async def validate_findings_with_ai(results: List[TestResult]) -> List[Finding]:
208208
"""Backwards-compatible wrapper returning only confirmed findings."""
209209
confirmed, _ = await classify_findings(results)
210210
return confirmed
211+
212+
213+
class AIDeduplicationCluster(BaseModel):
    """One group of findings the LLM judged to share the exact same root cause."""

    # Free-text justification produced by the model for audit logging.
    reasoning: str
    # test_case_id of every Finding assigned to this cluster.
    test_case_ids: List[str]
216+
217+
class AIDeduplicationPayload(BaseModel):
    """Top-level JSON payload expected back from the deduplication LLM call."""

    # Each input finding is supposed to appear in exactly one cluster.
    clusters: List[AIDeduplicationCluster]
219+
220+
221+
async def deduplicate_findings_with_ai(findings: List[Finding]) -> List[Finding]:
    """Analyze confirmed findings to merge duplicates sharing the exact same root cause.

    Sends a compact summary of every finding to the LLM, asks it to cluster
    findings by root cause, and keeps one representative per cluster (highest
    CVSS, then confidence). Findings the model fails to place in any cluster
    are re-appended so this step can only shrink the report, never silently
    lose a confirmed finding. On any LLM or parsing failure the input list is
    returned unchanged.

    Args:
        findings: Confirmed findings to deduplicate.

    Returns:
        A (possibly shorter) list of findings; never longer than the input.
    """
    # Nothing to merge with zero or one finding — skip the LLM round-trip.
    if len(findings) <= 1:
        return findings

    logger.info("Running AI deduplication analysis", count=len(findings))

    # Only ship the fields the model needs to judge root-cause equality.
    summarized_findings = [
        {
            "test_case_id": f.test_case_id,
            "endpoint": f.endpoint,
            "method": f.method,
            "vulnerability_class": f.vulnerability_class,
            "description": f.description,
            "remediation": f.remediation,
        }
        for f in findings
    ]

    sys_prompt = (
        "You are a senior AppSec engineer triaging a list of validated API vulnerabilities. "
        "Your goal is to deduplicate findings that represent the EXACT same underlying root cause and require the exact same fix. "
        "For example, multiple BOLA findings on `/api/users/1` and `/api/users/2` share the exact same root cause. "
        "Group them into clusters. "
        "Return ONLY valid JSON with the exact schema:\n"
        '{"clusters": [{"reasoning": "str explaining why these represent the same root cause", "test_case_ids": ["id1", "id2"]}]}\n'
        "Every single finding from the input MUST belong to exactly one cluster. If a finding is unique, it should be in a cluster by itself."
    )
    user_prompt = f"Validated Findings to cluster:\n{json.dumps(summarized_findings, indent=2)}"

    try:
        llm_resp = await call_llm(sys_prompt, user_prompt, temperature=0.1)
        payload = AIDeduplicationPayload.model_validate(json.loads(llm_resp))

        finding_map = {f.test_case_id: f for f in findings}
        deduplicated: List[Finding] = []
        # Track which ids any cluster has consumed, so a hallucinated duplicate
        # membership (same id in two clusters) cannot emit a finding twice.
        covered_ids = set()

        for cluster in payload.clusters:
            cluster_findings = [
                finding_map[tid]
                for tid in cluster.test_case_ids
                if tid in finding_map and tid not in covered_ids
            ]
            if not cluster_findings:
                continue
            covered_ids.update(f.test_case_id for f in cluster_findings)

            # Keep the finding with the highest CVSS, then confidence.
            # NOTE(review): assumes cvss_score/confidence are always comparable
            # (non-None) on confirmed findings — confirm against the Finding model.
            top_finding = max(cluster_findings, key=lambda x: (x.cvss_score, x.confidence))

            if len(cluster_findings) > 1:
                # Log the id actually kept (the highest-scored one), not an
                # arbitrary cluster member.
                logger.info(
                    "AI Merge Decision",
                    reasoning=cluster.reasoning,
                    merged_count=len(cluster_findings),
                    kept_id=top_finding.test_case_id,
                )

            deduplicated.append(top_finding)

        # The model may violate the "every finding in exactly one cluster" rule;
        # re-append anything it dropped so no confirmed finding is lost.
        for f in findings:
            if f.test_case_id not in covered_ids:
                deduplicated.append(f)

        if 0 < len(deduplicated) <= len(findings):
            logger.info("AI deduplication complete", original=len(findings), final=len(deduplicated))
            return deduplicated

    except (json.JSONDecodeError, ValidationError) as e:
        logger.warning("AI deduplication schema validation failed", error=str(e))
    except Exception as e:
        logger.warning("AI deduplication evaluation failed", error=str(e))

    # Fail open: return the untouched input on any error.
    return findings

src/secnodeapi/cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
validate_and_retest,
4141
write_report,
4242
)
43+
from .ai import deduplicate_findings_with_ai
4344
from .services.controller import ControllerService
4445
from .test_executor import execute_proactive_tests
4546

@@ -323,6 +324,9 @@ async def _run_full_pipeline(args, pipeline_input: PipelineInput) -> None:
323324
proxy=pipeline_input.proxy,
324325
verify_ssl=pipeline_input.verify_ssl,
325326
)
327+
328+
findings = await deduplicate_findings_with_ai(findings)
329+
326330
report = build_report(api_structure.title, findings)
327331
output_dir = write_report(report, args.target)
328332
logger.info("SecNode pipeline completed successfully", output_dir=output_dir)

src/secnodeapi/services/pipeline.py

Lines changed: 24 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from ..ai import (
1414
classify_findings,
15+
deduplicate_findings_with_ai,
1516
generate_test_cases,
1617
understand_api_with_ai,
1718
validate_findings_with_ai,
@@ -162,48 +163,10 @@ def _apply_identity_variants(
162163
return variants
163164

164165

165-
def _mutate_path(path: str) -> List[str]:
166-
mutations = set()
167-
stripped = path.rstrip("/")
168-
if "/v1/" in stripped:
169-
mutations.add(stripped.replace("/v1/", "/v2/", 1))
170-
if not stripped.endswith("/admin"):
171-
mutations.add(f"{stripped}/admin")
172-
if "{id}" in stripped:
173-
mutations.add(stripped.replace("{id}", "1"))
174-
mutations.add(stripped.replace("{id}", "2"))
175-
return [m for m in mutations if m and m != path]
176-
177-
178-
def _build_discovery_tests(endpoints: List[APIEndpoint]) -> List[TestCase]:
179-
discovery_tests: List[TestCase] = []
180-
for idx, endpoint in enumerate(endpoints):
181-
existing_method = endpoint.method.upper()
182-
for probe_method in ("GET", "POST", "PUT", "DELETE"):
183-
if probe_method == existing_method:
184-
continue
185-
discovery_tests.append(
186-
TestCase(
187-
id=f"DISCOVER-{idx}-{probe_method}",
188-
name="Method probing",
189-
description="Probe undocumented method support",
190-
owasp_category="API9: Improper Inventory Management",
191-
endpoint=endpoint.path,
192-
method=probe_method,
193-
)
194-
)
195-
for mutation_idx, mutated_path in enumerate(_mutate_path(endpoint.path)):
196-
discovery_tests.append(
197-
TestCase(
198-
id=f"MUTATE-{idx}-{mutation_idx}",
199-
name="Path mutation",
200-
description="Probe possible shadow or alternate API path",
201-
owasp_category="API9: Improper Inventory Management",
202-
endpoint=mutated_path,
203-
method="GET",
204-
)
205-
)
206-
return discovery_tests
166+
167+
168+
169+
207170

208171

209172
def _extract_candidate_ids(response_body: str) -> List[str]:
@@ -318,10 +281,8 @@ async def run_agent_pipeline(
318281
"""
319282
api_structure, seed_tests = await build_pipeline_artifacts(pipeline_input)
320283
identities = _resolve_identities(pipeline_input)
321-
discovery_tests = _build_discovery_tests(api_structure.endpoints)
322-
323284
queue = _deduplicate_test_cases(
324-
_apply_identity_variants(seed_tests + discovery_tests, identities)
285+
_apply_identity_variants(seed_tests, identities)
325286
)
326287
confirmed: List[Finding] = []
327288
suspected: List[Finding] = []
@@ -366,6 +327,9 @@ async def run_agent_pipeline(
366327
)
367328
queue = _deduplicate_test_cases(queue + chain_tests)
368329

330+
# Run final AI deduplication
331+
confirmed = await deduplicate_findings_with_ai(confirmed)
332+
369333
metrics = {
370334
"iterations": iteration,
371335
"requests_attempted": pipeline_input.request_budget - remaining_budget,
@@ -378,16 +342,30 @@ async def run_agent_pipeline(
378342
return api_structure, confirmed, suspected, metrics
379343

380344

345+
from .recon import perform_active_recon
346+
381347
async def build_pipeline_artifacts(
    pipeline_input: PipelineInput,
) -> tuple:
    """Run schema load, active recon, understanding, and test generation.

    Returns an ``(api_structure, tests)`` tuple. Endpoints discovered by
    active reconnaissance are merged into ``api_structure`` before AI
    understanding, so the model sees undocumented routes natively.
    """
    raw_schema = await fetch_schema(
        pipeline_input.target,
        proxy=pipeline_input.proxy,
        verify_ssl=pipeline_input.verify_ssl,
    )
    api_structure = analyze_api_structure(raw_schema)

    # Inject Active Reconnaissance
    discovered_endpoints = await perform_active_recon(api_structure, pipeline_input)
    if discovered_endpoints:
        api_structure.endpoints.extend(discovered_endpoints)

    # Deduplicate endpoints by (path, method). The method is upper-cased so
    # "get" and "GET" collapse into one key, and the FIRST occurrence wins:
    # documented schema endpoints precede recon stubs (which carry no parameter
    # metadata), so the richer definition survives a collision.
    unique_endpoints = {}
    for ep in api_structure.endpoints:
        unique_endpoints.setdefault((ep.path, ep.method.upper()), ep)
    api_structure.endpoints = list(unique_endpoints.values())

    understanding = await understand_api_with_ai(api_structure)
    # Only the first instruction set (if any) is forwarded to generation.
    instructions = (
        pipeline_input.instructions[0].model_dump()
        if pipeline_input.instructions
        else None
    )
    tests = await generate_test_cases(understanding, api_structure, instructions=instructions)
    return api_structure, tests

src/secnodeapi/services/recon.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
"""
2+
Active API Reconnaissance & Discovery Service.
3+
Probes for hidden or undocumented endpoints and parameters before AI analysis.
4+
"""
5+
6+
from typing import List, Dict, Set
7+
import structlog
8+
9+
from ..vulnerability_models import APIEndpoint, SchemaStructure, TestCase
10+
from .pipeline import PipelineInput
11+
from ..test_executor import execute_proactive_tests_detailed
12+
13+
logger = structlog.get_logger(__name__)
14+
15+
16+
def _mutate_path(path: str) -> List[str]:
17+
"""Generate potential hidden paths based on existing routes."""
18+
mutations: Set[str] = set()
19+
stripped = path.rstrip("/")
20+
if "/v1/" in stripped:
21+
mutations.add(stripped.replace("/v1/", "/v2/", 1))
22+
mutations.add(stripped.replace("/v1/", "/internal/", 1))
23+
if not stripped.endswith("/admin"):
24+
mutations.add(f"{stripped}/admin")
25+
if "{id}" in stripped:
26+
mutations.add(stripped.replace("{id}", "1"))
27+
mutations.add(stripped.replace("{id}", "2"))
28+
return [m for m in mutations if m and m != path]
29+
30+
31+
def _build_discovery_tests(endpoints: List[APIEndpoint]) -> List[TestCase]:
    """Generate exploratory test cases for undocumented methods and mutated paths."""
    discovery_tests: List[TestCase] = []

    # Collect the documented HTTP verbs per path so probes only target gaps.
    known_methods_by_path: Dict[str, Set[str]] = {}
    for endpoint in endpoints:
        known_methods_by_path.setdefault(endpoint.path, set()).add(endpoint.method.upper())

    idx = 0
    for path, known_methods in known_methods_by_path.items():
        # Probe HTTP verbs the schema does not document for this path.
        for probe_method in ("GET", "POST", "PUT", "DELETE", "PATCH"):
            if probe_method in known_methods:
                continue
            discovery_tests.append(
                TestCase(
                    id=f"DISCOVER-{idx}-{probe_method}",
                    name="Method probing",
                    description="Probe undocumented method support",
                    owasp_category="API9: Improper Inventory Management",
                    endpoint=path,
                    method=probe_method,
                )
            )
        idx += 1

        # Probe mutated variants of the path (e.g. /v2/ instead of /v1/).
        for mutation_idx, mutated_path in enumerate(_mutate_path(path)):
            discovery_tests.append(
                TestCase(
                    id=f"MUTATE-{idx}-{mutation_idx}",
                    name="Path mutation",
                    description="Probe possible shadow or alternate API path",
                    owasp_category="API9: Improper Inventory Management",
                    endpoint=mutated_path,
                    method="GET",
                )
            )
        idx += 1

    return discovery_tests
76+
77+
78+
async def perform_active_recon(
    api_structure: SchemaStructure, pipeline_input: PipelineInput
) -> List[APIEndpoint]:
    """
    Actively scan the application for undocumented endpoints and parameters.
    Returns a list of newly discovered APIEndpoints.
    """
    logger.info("Starting active reconnaissance for hidden endpoints")

    probes = _build_discovery_tests(api_structure.endpoints)
    if not probes:
        return []

    results, _ = await execute_proactive_tests_detailed(
        test_cases=probes,
        base_url=api_structure.base_url,
        concurrency=pipeline_input.concurrency,
        auth_headers=pipeline_input.auth_headers,
        proxy=pipeline_input.proxy,
        verify_ssl=pipeline_input.verify_ssl,
        max_requests=len(probes),
    )

    discovered: List[APIEndpoint] = []
    for probe_result in results:
        status = probe_result.status_code
        # 2xx means the endpoint definitely exists; 401/403/405 means it
        # exists but is restricted or rejects the probed verb; anything else
        # (notably 404) is treated as non-existent.
        exists = (200 <= status < 300) or status in (401, 403, 405)
        if not exists:
            continue

        new_endpoint = APIEndpoint(
            path=probe_result.test_case.endpoint,
            method=probe_result.test_case.method.upper(),
            operation_id=f"recon_discovered_{probe_result.test_case.method.lower()}",
            parameters=[],  # parameter inference from error bodies is future work
            auth_required=(status in (401, 403)),
            request_body_schema={"present": False},  # bodies can't be inferred from bare probes
        )
        discovered.append(new_endpoint)
        logger.info("Discovered undocumented endpoint", method=new_endpoint.method, path=new_endpoint.path, status=status)

    return discovered

0 commit comments

Comments
 (0)