From 75b6e9f5c311c69d6eae57b4d444b8b4d1dd9c80 Mon Sep 17 00:00:00 2001 From: Wolfvin Date: Mon, 29 Jun 2026 15:46:19 +0000 Subject: [PATCH] fix: enforce 3000-file cap on _auto_setup fallback path (#34) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue #34: _auto_setup's subprocess scan passes --max-files 3000 on the CLI, but commands/scan.add_args did not register --max-files, so argparse rejected it (exit 2) every time. The fallback cmd_scan(workspace, incremental=False) was therefore ALWAYS taken — with no cap and no timeout — while the result hint still claimed 'Auto-setup capped at 3000 files'. On huge repos this could hang indefinitely. Fix: - Add max_files: Optional[int] param to cmd_scan + _cap_discovered_files helper that truncates per-category file lists so total <= max_files. - Register --max-files in commands/scan.add_args so the subprocess path actually works (previously dead code). - Rework _auto_setup so the in-process fallback calls cmd_scan(..., max_files=_AUTO_SETUP_MAX_FILES) — same cap as the subprocess path. - Surface capped and fallback flags on result['_auto_setup'] so MCP clients/agents can tell which path produced the registry and whether the cap was hit (explicitly requested in issue #34). Tests (tests/test_cli.py::TestAutoSetupFallbackCap): - test_fallback_passes_max_files_cap: monkeypatch subprocess.run to raise, spy on cmd_scan, assert max_files=3000 is passed. - test_fallback_sets_capped_and_fallback_flags: 3001-file workspace + forced fallback, drive full codelens.main() flow, assert result['_auto_setup']['capped'] is True and ['fallback'] is True. - test_main_path_no_fallback_when_subprocess_succeeds: sanity guard that the main path still works (fallback=False, capped=False for a small workspace) and the flags are always present in the schema. 
--- scripts/codelens.py | 86 +++++++++++++----- scripts/commands/scan.py | 37 +++++++- tests/test_cli.py | 191 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 27 deletions(-) diff --git a/scripts/codelens.py b/scripts/codelens.py index 83b63a6..de8e546 100755 --- a/scripts/codelens.py +++ b/scripts/codelens.py @@ -244,14 +244,25 @@ def _registry_exists(workspace: str) -> bool: def _auto_setup(workspace: str) -> Dict[str, Any]: """Auto-run init + scan when no registry exists. Returns scan result or error info. - - Includes timeout protection: if the workspace has many source files, - limits scan to --max-files 3000 to prevent long auto-setup times. + + Applies a hard cap of ``_AUTO_SETUP_MAX_FILES`` (3000) files on BOTH the + subprocess path and the in-process fallback path, bounding parse cost on + huge repos (issue #34). The fallback still runs without a timeout. + + Returns a dict with: + - ``auto_setup``: "ok" | "failed" + - ``capped``: True iff the 3000-file cap was reached (only on success) + - ``fallback``: True iff the in-process fallback path was taken (only on success) + - ``files_scanned``: total files scanned (only on success) + - ``hint``: human-readable note (only present when ``capped`` is True) + - ``stage`` / ``error``: failure details (only on failure) """ from commands.init import cmd_init from commands.scan import cmd_scan + import subprocess - # Cap to prevent 5+ minute auto-setup on large repos + # Cap to prevent 5+ minute auto-setup on large repos. + # Applied to BOTH the subprocess path and the in-process fallback. _AUTO_SETUP_MAX_FILES = 3000 _AUTO_SETUP_TIMEOUT_MSG = ( "Auto-setup running with --max-files 3000 to prevent timeout. 
" @@ -268,37 +279,57 @@ def _auto_setup(workspace: str) -> Dict[str, Any]: except Exception as e: return {"auto_setup": "failed", "stage": "init", "error": str(e)} - # Step 2: Scan (with max-files cap for auto-setup) + # Step 2: Scan (with --max-files cap on BOTH paths) try: print(f"[CodeLens] {_AUTO_SETUP_TIMEOUT_MSG}", file=sys.stderr) - # Use subprocess to run scan with --max-files flag - # This avoids coupling to cmd_scan's internal signature + # Primary path: subprocess with --max-files flag (timeout=120s). + # This isolates the scan in a child process so we can enforce a + # hard wall-clock timeout on top of the file-count cap. scan_cmd = [sys.executable, os.path.join(SCRIPT_DIR, "codelens.py"), "scan", workspace, "--max-files", str(_AUTO_SETUP_MAX_FILES)] - scan_proc = __import__("subprocess").run( - scan_cmd, capture_output=True, text=True, timeout=120 - ) - if scan_proc.returncode != 0: - # Fallback: try without max-files - try: - scan_result = cmd_scan(workspace, incremental=False) - if scan_result.get("status") != "ok": - return {"auto_setup": "failed", "stage": "scan", "error": scan_result} - except Exception as e2: - return {"auto_setup": "failed", "stage": "scan", "error": str(e2)} - else: - scan_result = json.loads(scan_proc.stdout) if scan_proc.stdout.strip() else {"status": "ok"} + fallback_taken = False + scan_result: Optional[Dict[str, Any]] = None + try: + scan_proc = subprocess.run( + scan_cmd, capture_output=True, text=True, timeout=120 + ) + if scan_proc.returncode == 0: + scan_result = ( + json.loads(scan_proc.stdout) + if scan_proc.stdout.strip() + else {"status": "ok"} + ) + except Exception as e: + print(f"[CodeLens] Scan subprocess error: {e}; " + "falling back to in-process scan.", file=sys.stderr) + + # Fallback path: in-process scan with the SAME max_files cap. + # The cap is enforced by cmd_scan(max_files=...) so huge repos + # cannot hang auto-setup even when the subprocess path fails. 
+ if scan_result is None: + fallback_taken = True + print(f"[CodeLens] Falling back to in-process scan " + f"with max_files={_AUTO_SETUP_MAX_FILES}.", file=sys.stderr) + scan_result = cmd_scan( + workspace, incremental=False, max_files=_AUTO_SETUP_MAX_FILES + ) + if scan_result.get("status") != "ok": + return {"auto_setup": "failed", "stage": "scan", "error": scan_result} files_scanned = scan_result.get("files_scanned", {}) total_files = sum(v for v in files_scanned.values() if isinstance(v, int)) if isinstance(files_scanned, dict) else 0 - print(f"[CodeLens] Auto-setup complete. {total_files} files scanned. Registry built.", file=sys.stderr) - - result_info = { + capped = total_files >= _AUTO_SETUP_MAX_FILES + print(f"[CodeLens] Auto-setup complete. {total_files} files scanned. " + f"Registry built. (fallback={fallback_taken}, capped={capped})", + file=sys.stderr) + + result_info: Dict[str, Any] = { "auto_setup": "ok", "files_scanned": total_files, - "capped": total_files >= _AUTO_SETUP_MAX_FILES, + "capped": capped, + "fallback": fallback_taken, } - if total_files >= _AUTO_SETUP_MAX_FILES: + if capped: result_info["hint"] = "Auto-setup capped at 3000 files. Run 'scan' manually for full analysis." return result_info except Exception as e: @@ -955,6 +986,11 @@ def main(): auto_setup_info = { "auto_setup": True, "message": "Registry was auto-built. For best results, run 'scan' manually on large repos.", + # Issue #34: surface which path produced the registry and + # whether the 3000-file cap was hit, so MCP clients / agents + # can decide whether to trust the registry or re-scan. 
+ "capped": bool(auto_setup_result.get("capped", False)), + "fallback": bool(auto_setup_result.get("fallback", False)), } else: auto_setup_info = { diff --git a/scripts/commands/scan.py b/scripts/commands/scan.py index dd2142d..1ef5f96 100644 --- a/scripts/commands/scan.py +++ b/scripts/commands/scan.py @@ -52,25 +52,33 @@ def add_args(parser): help="Only re-scan changed files") parser.add_argument("--plugins", nargs="*", default=None, help="Enable plugin rules: specify plugin names or 'all' for all rule_pack plugins") + parser.add_argument("--max-files", type=int, default=None, + help="Cap total files scanned (default: unlimited). " + "Used by auto-setup to prevent timeout on huge repos.") def execute(args, workspace): """Execute the scan command.""" incremental = getattr(args, 'incremental', False) plugins = getattr(args, 'plugins', None) + max_files = getattr(args, 'max_files', None) # Only auto-enable incremental if the user didn't explicitly request a full scan # and the registry already exists. We check for explicit --incremental flag. # Note: When user runs "scan" without --incremental, they expect a full scan. # Auto-incremental was causing confusion where 2nd scan would miss changes. # Now: explicit --incremental for incremental, bare "scan" for full scan. - return cmd_scan(workspace, incremental, plugins=plugins) + return cmd_scan(workspace, incremental, plugins=plugins, max_files=max_files) -def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] = None) -> Dict[str, Any]: +def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] = None, + max_files: Optional[int] = None) -> Dict[str, Any]: """ Scan the workspace and build/update the registry. + If incremental=True, only re-scan changed files. If plugins is provided, load plugin rules for the scan. + If max_files is provided and > 0, cap the total number of discovered files + that get parsed (used by auto-setup to prevent timeout on huge repos). 
""" workspace = os.path.abspath(workspace) config = load_config(workspace) @@ -88,6 +96,12 @@ def cmd_scan(workspace: str, incremental: bool = False, plugins: Optional[list] # Discover files files = discover_files(workspace, config) + # Apply max_files cap (auto-setup uses this to bound scan time on huge repos). + # The cap is applied AFTER discovery but BEFORE parsing, so os.walk cost is + # unchanged but parsing/registry-build cost is bounded. + if max_files is not None and max_files > 0: + files = _cap_discovered_files(files, max_files) + # Check if incremental scan is possible changed_files = None if incremental: @@ -1172,6 +1186,25 @@ def _build_lang_note(fw: Dict) -> Optional[str]: return f"Detected {', '.join(parts)} source files — these languages do not have dedicated parsers yet. CodeLens uses regex-based fallback extraction for many languages, but analysis may be less accurate than for fully supported languages (JS/TS/Python/Rust/HTML/CSS). Note: Go, Java, Kotlin, C/C++, C#, Ruby, Elixir, Dart, Swift, Scala, Shell, PHP, GDScript, Lua, and Objective-C all have fallback parsers; they are listed here only when no parser exists." +def _cap_discovered_files(files: Dict[str, List[str]], max_files: int) -> Dict[str, List[str]]: + """Cap total files across all categories to ``max_files``. + + Truncates per-category lists in dict iteration order until the budget + is exhausted; remaining categories are emptied. Used by auto-setup to + bound scan time on huge repos (issue #34). + """ + capped: Dict[str, List[str]] = {} + remaining = max_files + for key, file_list in files.items(): + if not file_list or remaining <= 0: + capped[key] = [] + continue + take = file_list[:remaining] + capped[key] = take + remaining -= len(take) + return capped + + def discover_files(workspace: str, config: Dict) -> Dict[str, List[str]]: """ Discover all relevant source files in the workspace. 
diff --git a/tests/test_cli.py b/tests/test_cli.py index c572507..fd077dc 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -331,3 +331,194 @@ def test_check_full_cli_invocation_with_positional(self): finally: import shutil shutil.rmtree(ws, ignore_errors=True) + + +# ─── _auto_setup fallback cap (issue #34) ─────────────────────────── + + +class TestAutoSetupFallbackCap: + """Regression tests for issue #34. + + ``_auto_setup`` in scripts/codelens.py runs scan via subprocess with + ``--max-files 3000`` as a timeout guard. When the subprocess failed + (non-zero exit / exception), the fallback called + ``cmd_scan(workspace, incremental=False)`` with NO cap and NO timeout, + so huge repos could hang auto-setup indefinitely — while the result + hint still claimed "Auto-setup capped at 3000 files" (a lie). + + Fix: the fallback must pass ``max_files=_AUTO_SETUP_MAX_FILES`` to + ``cmd_scan``, and ``result["_auto_setup"]`` must surface ``capped`` and + ``fallback`` flags so MCP clients can tell which path produced the + registry. + """ + + def test_fallback_passes_max_files_cap(self, monkeypatch): + """When the subprocess scan fails, the fallback ``cmd_scan`` call + must be invoked with ``max_files=3000`` (not uncapped).""" + import subprocess + from commands import scan as scan_mod + import codelens + + ws = _create_sample_workspace() + try: + # Force subprocess.run to raise so the fallback path is taken. + def fake_run(*args, **kwargs): + raise subprocess.SubprocessError("simulated subprocess failure") + monkeypatch.setattr(subprocess, "run", fake_run) + + # Spy on cmd_scan to capture the max_files kwarg. + captured = {} + real_cmd_scan = scan_mod.cmd_scan + + def spy_cmd_scan(workspace, incremental=False, plugins=None, max_files=None): + captured["called"] = True + captured["max_files"] = max_files + captured["incremental"] = incremental + # Delegate to the real cmd_scan so the registry actually + # gets built (otherwise _auto_setup would fail downstream). 
+ return real_cmd_scan( + workspace, incremental=incremental, + plugins=plugins, max_files=max_files, + ) + monkeypatch.setattr(scan_mod, "cmd_scan", spy_cmd_scan) + + result = codelens._auto_setup(ws) + + assert captured.get("called") is True, ( + "Fallback path did not call cmd_scan at all" + ) + assert captured.get("max_files") == 3000, ( + f"Fallback must pass max_files=3000 to cmd_scan; " + f"got: {captured.get('max_files')!r}" + ) + assert result.get("auto_setup") == "ok" + assert result.get("fallback") is True + finally: + import shutil + shutil.rmtree(ws, ignore_errors=True) + + def test_fallback_sets_capped_and_fallback_flags(self, monkeypatch, capsys): + """``result['_auto_setup']`` must include ``capped=True`` and + ``fallback=True`` when the fallback path runs against a workspace + large enough to actually hit the 3000-file cap. + + Verifies issue #34's Definition of Done item #2: the hint that + says "capped at 3000 files" must no longer be a lie. + + Drives the full CLI flow (``codelens.main()``) so the assertion + is on the actual ``result["_auto_setup"]`` dict that gets attached + to the command's JSON output, not just on ``_auto_setup()``'s + private return value. + """ + import subprocess + import codelens + + # Build a workspace with > 3000 source files so the cap is hit. + ws = tempfile.mkdtemp() + try: + os.makedirs(os.path.join(ws, "src"), exist_ok=True) + for i in range(3001): + with open(os.path.join(ws, "src", f"f{i}.py"), "w") as fh: + fh.write(f"def f{i}():\n pass\n") + + # Force subprocess.run to raise → fallback path is taken. + # This patches subprocess.run in THIS process, which is the + # same process codelens.main() runs in, so the inner + # subprocess call inside _auto_setup will hit the patch. 
+ def fake_run(*args, **kwargs): + raise subprocess.SubprocessError("simulated subprocess failure") + monkeypatch.setattr(subprocess, "run", fake_run) + + # Drive the full CLI flow in-process so we can assert on the + # actual result["_auto_setup"] attached to the JSON output. + old_argv = sys.argv + sys.argv = ["codelens.py", "list", ws, "--format", "json"] + try: + codelens.main() + except SystemExit as e: + # main() may sys.exit(0) on success or sys.exit(1) on gate + # failure (for `check`). For `list`, success path returns + # normally or exits 0. + assert e.code in (0, None), ( + f"unexpected exit code from main(): {e.code}" + ) + finally: + sys.argv = old_argv + + captured = capsys.readouterr() + assert captured.out.strip().startswith("{"), ( + f"expected JSON on stdout; got: {captured.out[:300]!r}" + ) + result = json.loads(captured.out.strip()) + auto = result.get("_auto_setup") + assert auto is not None, ( + "result['_auto_setup'] missing from CLI output; " + f"got keys: {list(result.keys())}" + ) + assert auto.get("fallback") is True, ( + f"expected fallback=True after subprocess failure; " + f"got _auto_setup={auto!r}" + ) + assert auto.get("capped") is True, ( + f"expected capped=True when workspace has >3000 files; " + f"got _auto_setup={auto!r}" + ) + finally: + import shutil + shutil.rmtree(ws, ignore_errors=True) + + def test_main_path_no_fallback_when_subprocess_succeeds(self, capsys): + """Sanity guard for Definition of Done item #3: when the subprocess + path succeeds, ``fallback`` must be False and the auto-setup flags + must still be present on ``result["_auto_setup"]``. + + Also confirms the issue #34 fix didn't break the main path: with + ``--max-files`` now a registered scan argument, the subprocess no + longer exits 2 on argparse rejection, so the main path actually + runs end-to-end (previously it silently failed every time and the + fallback was always taken). 
+ """ + import codelens + + ws = _create_sample_workspace() + try: + # Real subprocess (no monkeypatching). With the fix, + # `--max-files` is now a valid scan arg, so the subprocess + # should succeed and the fallback should NOT be taken. + old_argv = sys.argv + sys.argv = ["codelens.py", "list", ws, "--format", "json"] + try: + codelens.main() + except SystemExit as e: + assert e.code in (0, None), ( + f"unexpected exit code from main(): {e.code}" + ) + finally: + sys.argv = old_argv + + captured = capsys.readouterr() + assert captured.out.strip().startswith("{"), ( + f"expected JSON on stdout; got: {captured.out[:300]!r}" + ) + result = json.loads(captured.out.strip()) + auto = result.get("_auto_setup") + assert auto is not None, ( + f"result['_auto_setup'] missing; got keys: {list(result.keys())}" + ) + # Sample workspace has ~4 files (html, css, js, rs) — well below + # the 3000-file cap, so capped must be False. + assert auto.get("fallback") is False, ( + f"expected fallback=False on subprocess success; " + f"got _auto_setup={auto!r}" + ) + assert auto.get("capped") is False, ( + f"expected capped=False for small workspace; " + f"got _auto_setup={auto!r}" + ) + # Flags must always be present (even when False) so MCP clients + # can rely on the schema. + assert "capped" in auto, "capped flag missing from _auto_setup" + assert "fallback" in auto, "fallback flag missing from _auto_setup" + finally: + import shutil + shutil.rmtree(ws, ignore_errors=True)