From 75b6e9f5c311c69d6eae57b4d444b8b4d1dd9c80 Mon Sep 17 00:00:00 2001
From: Wolfvin <wolfvin@users.noreply.github.com>
Date: Mon, 29 Jun 2026 15:46:19 +0000
Subject: [PATCH] fix: enforce 3000-file cap on _auto_setup fallback path (#34)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #34: _auto_setup's subprocess scan passes --max-files 3000 on the
CLI, but commands/scan.add_args did not register --max-files, so argparse
rejected it (exit 2) every time. The fallback cmd_scan(workspace,
incremental=False) was therefore ALWAYS taken — with no cap and no
timeout — while the result hint still claimed 'Auto-setup capped at
3000 files'. On huge repos this could hang indefinitely.

Fix:
- Add a max_files: Optional[int] param to cmd_scan, plus a _cap_discovered_files
  helper that truncates per-category file lists so the total is <= max_files.
- Register --max-files in commands/scan.add_args so the subprocess path
  actually works (previously dead code).
- Rework _auto_setup so the in-process fallback calls
  cmd_scan(..., max_files=_AUTO_SETUP_MAX_FILES) — same cap as the
  subprocess path.
- Surface capped and fallback flags on result['_auto_setup'] so MCP
  clients/agents can tell which path produced the registry and whether
  the cap was hit (explicitly requested in issue #34).

Tests (tests/test_cli.py::TestAutoSetupFallbackCap):
- test_fallback_passes_max_files_cap: monkeypatch subprocess.run to
  raise, spy on cmd_scan, assert max_files=3000 is passed.
- test_fallback_sets_capped_and_fallback_flags: 3001-file workspace +
  forced fallback, drive full codelens.main() flow, assert
  result['_auto_setup']['capped'] is True and ['fallback'] is True.
- test_main_path_no_fallback_when_subprocess_succeeds: sanity guard
  that the main path still works (fallback=False, capped=False for a
  small workspace) and the flags are always present in the schema.
---
 scripts/codelens.py      |  86 +++++++++++++-----
 scripts/commands/scan.py |  37 +++++++-
 tests/test_cli.py        | 191 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 287 insertions(+), 27 deletions(-)

diff --git a/scripts/codelens.py b/scripts/codelens.py
index 83b63a6..de8e546 100755
--- a/scripts/codelens.py
+++ b/scripts/codelens.py
@@ -244,14 +244,25 @@ def _registry_exists(workspace: str) -> bool:
 
 def _auto_setup(workspace: str) -> Dict[str, Any]:
     """Auto-run init + scan when no registry exists. Returns scan result or error info.
-    
-    Includes timeout protection: if the workspace has many source files,
-    limits scan to --max-files 3000 to prevent long auto-setup times.
+
+    Applies a hard cap of ``_AUTO_SETUP_MAX_FILES`` (3000) files on BOTH the
+    subprocess path and the in-process fallback path, so the number of files
+    scanned on huge repos is bounded no matter which path runs (issue #34).
+
+    Returns a dict with:
+      - ``auto_setup``: "ok" | "failed"
+      - ``capped``: True iff ``files_scanned`` >= 3000, i.e. the cap was reached (only on success)
+      - ``fallback``: True iff the in-process fallback path was taken (only on success)
+      - ``files_scanned``: total files scanned (only on success)
+      - ``hint``: human-readable note (only present when ``capped`` is True)
+      - ``stage`` / ``error``: failure details (only on failure)
     """
     from commands.init import cmd_init
     from commands.scan import cmd_scan
+    import subprocess
 
-    # Cap to prevent 5+ minute auto-setup on large repos
+    # Cap to prevent 5+ minute auto-setup on large repos.
+    # Applied to BOTH the subprocess path and the in-process fallback.
     _AUTO_SETUP_MAX_FILES = 3000
     _AUTO_SETUP_TIMEOUT_MSG = (
         "Auto-setup running with --max-files 3000 to prevent timeout. "
@@ -268,37 +279,57 @@ def _auto_setup(workspace: str) -> Dict[str, Any]:
     except Exception as e:
         return {"auto_setup": "failed", "stage": "init", "error": str(e)}
 
-    # Step 2: Scan (with max-files cap for auto-setup)
+    # Step 2: Scan (with --max-files cap on BOTH paths)
     try:
         print(f"[CodeLens] {_AUTO_SETUP_TIMEOUT_MSG}", file=sys.stderr)
-        # Use subprocess to run scan with --max-files flag
-        # This avoids coupling to cmd_scan's internal signature
+        # Primary path: subprocess with --max-files flag (timeout=120s).
+        # This isolates the scan in a child process so we can enforce a
+        # hard wall-clock timeout on top of the file-count cap.
         scan_cmd = [sys.executable, os.path.join(SCRIPT_DIR, "codelens.py"),
                      "scan", workspace, "--max-files", str(_AUTO_SETUP_MAX_FILES)]
-        scan_proc = __import__("subprocess").run(
-            scan_cmd, capture_output=True, text=True, timeout=120
-        )
-        if scan_proc.returncode != 0:
-            # Fallback: try without max-files
-            try:
-                scan_result = cmd_scan(workspace, incremental=False)
-                if scan_result.get("status") != "ok":
-                    return {"auto_setup": "failed", "stage": "scan", "error": scan_result}
-            except Exception as e2:
-                return {"auto_setup": "failed", "stage": "scan", "error": str(e2)}
-        else:
-            scan_result = json.loads(scan_proc.stdout) if scan_proc.stdout.strip() else {"status": "ok"}
+        fallback_taken = False
+        scan_result: Optional[Dict[str, Any]] = None
+        try:
+            scan_proc = subprocess.run(
+                scan_cmd, capture_output=True, text=True, timeout=120
+            )
+            if scan_proc.returncode == 0:
+                scan_result = (
+                    json.loads(scan_proc.stdout)
+                    if scan_proc.stdout.strip()
+                    else {"status": "ok"}
+                )
+        except Exception as e:
+            print(f"[CodeLens] Scan subprocess error: {e}; "
+                  "falling back to in-process scan.", file=sys.stderr)
+
+        # Fallback path: in-process scan with the SAME max_files cap.
+        # cmd_scan(max_files=...) bounds how many files get scanned; unlike
+        # the subprocess path, there is no wall-clock timeout on this call.
+        if scan_result is None:
+            fallback_taken = True
+            print(f"[CodeLens] Falling back to in-process scan "
+                  f"with max_files={_AUTO_SETUP_MAX_FILES}.", file=sys.stderr)
+            scan_result = cmd_scan(
+                workspace, incremental=False, max_files=_AUTO_SETUP_MAX_FILES
+            )
+            if scan_result.get("status") != "ok":
+                return {"auto_setup": "failed", "stage": "scan", "error": scan_result}
 
         files_scanned = scan_result.get("files_scanned", {})
         total_files = sum(v for v in files_scanned.values() if isinstance(v, int)) if isinstance(files_scanned, dict) else 0
-        print(f"[CodeLens] Auto-setup complete. {total_files} files scanned. Registry built.", file=sys.stderr)
-        
-        result_info = {
+        capped = total_files >= _AUTO_SETUP_MAX_FILES
+        print(f"[CodeLens] Auto-setup complete. {total_files} files scanned. "
+              f"Registry built. (fallback={fallback_taken}, capped={capped})",
+              file=sys.stderr)
+
+        result_info: Dict[str, Any] = {
             "auto_setup": "ok",
             "files_scanned": total_files,
-            "capped": total_files >= _AUTO_SETUP_MAX_FILES,
+            "capped": capped,
+            "fallback": fallback_taken,
         }
-        if total_files >= _AUTO_SETUP_MAX_FILES:
+        if capped:
             result_info["hint"] = "Auto-setup capped at 3000 files. Run 'scan' manually for full analysis."
         return result_info
     except Exception as e:
@@ -955,6 +986,11 @@ def main():
             auto_setup_info = {
                 "auto_setup": True,
                 "message": "Registry was auto-built. For best results, run 'scan' manually on large repos.",
+                # Issue #34: surface which path produced the registry and
+                # whether the 3000-file cap was hit, so MCP clients / agents
+                # can decide whether to trust the registry or re-scan.
+                "capped": bool(auto_setup_result.get("capped", False)),
+                "fallback": bool(auto_setup_result.get("fallback", False)),
             }
         else:
             auto_setup_info = {
diff --git a/scripts/commands/scan.py b/scripts/commands/scan.py
index dd2142d..1ef5f96 100644
--- a/scripts/commands/scan.py
+++ b/scripts/commands/scan.py
@@ -52,25 +52,33 @@ def add_args(parser):
                         help="Only re-scan changed files")
     parser.add_argument("--plugins", nargs="*", default=None,
                         help="Enable plugin rules: specify plugin names or 'all' for all rule_pack plugins")
+    parser.add_argument("--max-files", type=int, default=None,
+                        help="Cap total files scanned (default: unlimited). "
+                             "Used by auto-setup to prevent timeout on huge repos.")
 
 
 def execute(args, workspace):
     """Execute the scan command."""
     incremental = getattr(args, 'incremental', False)
     plugins = getattr(args, 'plugins', None)
+    max_files = getattr(args, 'max_files', None)
     # Only auto-enable incremental if the user didn't explicitly request a full scan
     # and the registry already exists. We check for explicit --incremental flag.
     # Note: When user runs "scan" without --incremental, they expect a full scan.
     # Auto-incremental was causing confusion where 2nd scan would miss changes.
     # Now: explicit --incremental for incremental, bare "scan" for full scan.
-    return cmd_scan(workspace, incremental, plugins=plugins)
+    return cmd_scan(workspace, incremental, plugins=plugins, max_files=max_files)
 
 
-def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] = None) -> Dict[str, Any]:
+def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] = None,
+             max_files: Optional[int] = None) -> Dict[str, Any]:
     """
     Scan the workspace and build/update the registry.
+
     If incremental=True, only re-scan changed files.
     If plugins is provided, load plugin rules for the scan.
+    If max_files is a positive int, cap the total number of discovered files that
+    get parsed (used by auto-setup on huge repos); None, 0 or negatives mean no cap.
     """
     workspace = os.path.abspath(workspace)
     config = load_config(workspace)
@@ -88,6 +96,12 @@ def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list]
     # Discover files
     files = discover_files(workspace, config)
 
+    # Apply max_files cap (auto-setup uses this to bound scan time on huge repos).
+    # The cap is applied AFTER discovery but BEFORE parsing, so os.walk cost is
+    # unchanged but parsing/registry-build cost is bounded.
+    if max_files is not None and max_files > 0:
+        files = _cap_discovered_files(files, max_files)
+
     # Check if incremental scan is possible
     changed_files = None
     if incremental:
@@ -1172,6 +1186,25 @@ def _build_lang_note(fw: Dict) -> Optional[str]:
     return f"Detected {', '.join(parts)} source files — these languages do not have dedicated parsers yet. CodeLens uses regex-based fallback extraction for many languages, but analysis may be less accurate than for fully supported languages (JS/TS/Python/Rust/HTML/CSS). Note: Go, Java, Kotlin, C/C++, C#, Ruby, Elixir, Dart, Swift, Scala, Shell, PHP, GDScript, Lua, and Objective-C all have fallback parsers; they are listed here only when no parser exists."
 
 
+def _cap_discovered_files(files: Dict[str, List[str]], max_files: int) -> Dict[str, List[str]]:
+    """Cap total files across all categories to ``max_files``.
+
+    Returns a new dict: per-category lists are truncated in dict iteration
+    order until the budget is exhausted, and later categories become ``[]``.
+    Used by auto-setup to bound scan time on huge repos (issue #34).
+    """
+    capped: Dict[str, List[str]] = {}
+    remaining = max_files
+    for key, file_list in files.items():
+        if not file_list or remaining <= 0:
+            capped[key] = []
+            continue
+        take = file_list[:remaining]
+        capped[key] = take
+        remaining -= len(take)
+    return capped
+
+
 def discover_files(workspace: str, config: Dict) -> Dict[str, List[str]]:
     """
     Discover all relevant source files in the workspace.
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c572507..fd077dc 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -331,3 +331,194 @@ def test_check_full_cli_invocation_with_positional(self):
         finally:
             import shutil
             shutil.rmtree(ws, ignore_errors=True)
+
+
+# ─── _auto_setup fallback cap (issue #34) ───────────────────────────
+
+
+class TestAutoSetupFallbackCap:
+    """Regression tests for issue #34.
+
+    ``_auto_setup`` in scripts/codelens.py runs scan via subprocess with
+    ``--max-files 3000`` as a timeout guard. When the subprocess failed
+    (non-zero exit / exception), the fallback called
+    ``cmd_scan(workspace, incremental=False)`` with NO cap and NO timeout,
+    so huge repos could hang auto-setup indefinitely — while the result
+    hint still claimed "Auto-setup capped at 3000 files" (a lie).
+
+    Fix: the fallback must pass ``max_files=_AUTO_SETUP_MAX_FILES`` to
+    ``cmd_scan``, and ``result["_auto_setup"]`` must surface ``capped`` and
+    ``fallback`` flags so MCP clients can tell which path produced the
+    registry.
+    """
+
+    def test_fallback_passes_max_files_cap(self, monkeypatch):
+        """When the subprocess scan fails, the fallback ``cmd_scan`` call
+        must be invoked with ``max_files=3000`` (not uncapped)."""
+        import subprocess
+        from commands import scan as scan_mod
+        import codelens
+
+        ws = _create_sample_workspace()
+        try:
+            # Force subprocess.run to raise so the fallback path is taken.
+            def fake_run(*args, **kwargs):
+                raise subprocess.SubprocessError("simulated subprocess failure")
+            monkeypatch.setattr(subprocess, "run", fake_run)
+
+            # Spy on cmd_scan to capture the max_files kwarg.
+            captured = {}
+            real_cmd_scan = scan_mod.cmd_scan
+
+            def spy_cmd_scan(workspace, incremental=False, plugins=None, max_files=None):
+                captured["called"] = True
+                captured["max_files"] = max_files
+                captured["incremental"] = incremental
+                # Delegate to the real cmd_scan so the registry actually
+                # gets built (otherwise _auto_setup would fail downstream).
+                return real_cmd_scan(
+                    workspace, incremental=incremental,
+                    plugins=plugins, max_files=max_files,
+                )
+            monkeypatch.setattr(scan_mod, "cmd_scan", spy_cmd_scan)
+
+            result = codelens._auto_setup(ws)
+
+            assert captured.get("called") is True, (
+                "Fallback path did not call cmd_scan at all"
+            )
+            assert captured.get("max_files") == 3000, (
+                f"Fallback must pass max_files=3000 to cmd_scan; "
+                f"got: {captured.get('max_files')!r}"
+            )
+            assert result.get("auto_setup") == "ok"
+            assert result.get("fallback") is True
+        finally:
+            import shutil
+            shutil.rmtree(ws, ignore_errors=True)
+
+    def test_fallback_sets_capped_and_fallback_flags(self, monkeypatch, capsys):
+        """``result['_auto_setup']`` must include ``capped=True`` and
+        ``fallback=True`` when the fallback path runs against a workspace
+        large enough to actually hit the 3000-file cap.
+
+        Verifies issue #34's Definition of Done item #2: the hint that
+        says "capped at 3000 files" must no longer be a lie.
+
+        Drives the full CLI flow (``codelens.main()``) so the assertion
+        is on the actual ``result["_auto_setup"]`` dict that gets attached
+        to the command's JSON output, not just on ``_auto_setup()``'s
+        private return value.
+        """
+        import subprocess
+        import codelens
+
+        # Build a workspace with > 3000 source files so the cap is hit.
+        ws = tempfile.mkdtemp()
+        try:
+            os.makedirs(os.path.join(ws, "src"), exist_ok=True)
+            for i in range(3001):
+                with open(os.path.join(ws, "src", f"f{i}.py"), "w") as fh:
+                    fh.write(f"def f{i}():\n    pass\n")
+
+            # Force subprocess.run to raise → fallback path is taken.
+            # This patches subprocess.run in THIS process, which is the
+            # same process codelens.main() runs in, so the inner
+            # subprocess call inside _auto_setup will hit the patch.
+            def fake_run(*args, **kwargs):
+                raise subprocess.SubprocessError("simulated subprocess failure")
+            monkeypatch.setattr(subprocess, "run", fake_run)
+
+            # Drive the full CLI flow in-process so we can assert on the
+            # actual result["_auto_setup"] attached to the JSON output.
+            old_argv = sys.argv
+            sys.argv = ["codelens.py", "list", ws, "--format", "json"]
+            try:
+                codelens.main()
+            except SystemExit as e:
+                # main() may sys.exit(0) on success or sys.exit(1) on gate
+                # failure (for `check`). For `list`, success path returns
+                # normally or exits 0.
+                assert e.code in (0, None), (
+                    f"unexpected exit code from main(): {e.code}"
+                )
+            finally:
+                sys.argv = old_argv
+
+            captured = capsys.readouterr()
+            assert captured.out.strip().startswith("{"), (
+                f"expected JSON on stdout; got: {captured.out[:300]!r}"
+            )
+            result = json.loads(captured.out.strip())
+            auto = result.get("_auto_setup")
+            assert auto is not None, (
+                "result['_auto_setup'] missing from CLI output; "
+                f"got keys: {list(result.keys())}"
+            )
+            assert auto.get("fallback") is True, (
+                f"expected fallback=True after subprocess failure; "
+                f"got _auto_setup={auto!r}"
+            )
+            assert auto.get("capped") is True, (
+                f"expected capped=True when workspace has >3000 files; "
+                f"got _auto_setup={auto!r}"
+            )
+        finally:
+            import shutil
+            shutil.rmtree(ws, ignore_errors=True)
+
+    def test_main_path_no_fallback_when_subprocess_succeeds(self, capsys):
+        """Sanity guard for Definition of Done item #3: when the subprocess
+        path succeeds, ``fallback`` must be False and the auto-setup flags
+        must still be present on ``result["_auto_setup"]``.
+
+        Also confirms the issue #34 fix didn't break the main path: with
+        ``--max-files`` now a registered scan argument, the subprocess no
+        longer exits 2 on argparse rejection, so the main path actually
+        runs end-to-end (previously it silently failed every time and the
+        fallback was always taken).
+        """
+        import codelens
+
+        ws = _create_sample_workspace()
+        try:
+            # Real subprocess (no monkeypatching). With the fix,
+            # `--max-files` is now a valid scan arg, so the subprocess
+            # should succeed and the fallback should NOT be taken.
+            old_argv = sys.argv
+            sys.argv = ["codelens.py", "list", ws, "--format", "json"]
+            try:
+                codelens.main()
+            except SystemExit as e:
+                assert e.code in (0, None), (
+                    f"unexpected exit code from main(): {e.code}"
+                )
+            finally:
+                sys.argv = old_argv
+
+            captured = capsys.readouterr()
+            assert captured.out.strip().startswith("{"), (
+                f"expected JSON on stdout; got: {captured.out[:300]!r}"
+            )
+            result = json.loads(captured.out.strip())
+            auto = result.get("_auto_setup")
+            assert auto is not None, (
+                f"result['_auto_setup'] missing; got keys: {list(result.keys())}"
+            )
+            # Sample workspace has ~4 files (html, css, js, rs) — well below
+            # the 3000-file cap, so capped must be False.
+            assert auto.get("fallback") is False, (
+                f"expected fallback=False on subprocess success; "
+                f"got _auto_setup={auto!r}"
+            )
+            assert auto.get("capped") is False, (
+                f"expected capped=False for small workspace; "
+                f"got _auto_setup={auto!r}"
+            )
+            # Flags must always be present (even when False) so MCP clients
+            # can rely on the schema.
+            assert "capped" in auto, "capped flag missing from _auto_setup"
+            assert "fallback" in auto, "fallback flag missing from _auto_setup"
+        finally:
+            import shutil
+            shutil.rmtree(ws, ignore_errors=True)