From 6f4511f302665db1f595bc7b91724361689f338d Mon Sep 17 00:00:00 2001 From: sciapanCA Date: Tue, 9 Jun 2026 21:07:39 +0200 Subject: [PATCH 1/2] Add datasource relevance filter support (--query) Mirror the backend/MCP datasource relevance filter in the skill: - get_datasources() accepts an optional natural-language query, sends it as ?query=, parses the X-CodeAlive-Total-Data-Sources header, and returns a {dataSources, message} envelope with fail-open detection - datasources.py gains a --query flag, renders relevanceReason per source and the omitted-count / fail-open message - SKILL.md, workflows reference, and the context-explorer agent now recommend passing the user's task as --query - Bump plugin version to 2.1.0 Co-Authored-By: Claude Fable 5 --- .claude-plugin/plugin.json | 2 +- agents/codealive-context-explorer.md | 11 ++- skills/codealive-context-engine/SKILL.md | 20 ++++- .../references/workflows.md | 10 ++- .../scripts/datasources.py | 57 ++++++++++-- .../scripts/lib/api_client.py | 86 +++++++++++++++++-- tests/test_cli_smoke.py | 65 ++++++++++++++ tests/test_setup_and_client.py | 60 +++++++++++++ 8 files changed, 283 insertions(+), 28 deletions(-) diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 4fe8224..1a7cdb5 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "codealive", "description": "CodeAlive context engine for semantic code search and AI-powered codebase Q&A. Enables AI coding agents to understand entire codebases beyond just open files — search across all indexed repositories, trace cross-service dependencies, discover usage patterns, and get synthesized answers to architectural questions. Includes a lightweight code exploration subagent, authentication hooks, and multiple search modes (fast lexical, semantic, and deep cross-cutting). Works standalone or alongside the CodeAlive MCP server for direct tool access via the Model Context Protocol.", - "version": "2.0.9", + "version": "2.1.0", "author": { "name": "CodeAlive AI", "email": "hello@codealive.ai" diff --git a/agents/codealive-context-explorer.md b/agents/codealive-context-explorer.md index a0a9740..f338ebe 100644 --- a/agents/codealive-context-explorer.md +++ b/agents/codealive-context-explorer.md @@ -16,7 +16,7 @@ You are a code exploration specialist. **Your default tool is CodeAlive — not Unless the request is unambiguously a local-only file lookup ("read line 42 of foo.ts", "is bar.py in this repo"), your first turn MUST include both of these calls before any answer: ```bash -python scripts/datasources.py +python scripts/datasources.py --query "" python scripts/search.py "" ``` @@ -28,9 +28,12 @@ The scripts directory is relative to the skill location. If a path fails, fall b ### 1. List data sources — run FIRST every session ```bash -python scripts/datasources.py +python scripts/datasources.py --query "" ``` -Without this you do not know what to search against. Instant, free, cheap. +Without this you do not know what to search against. Pass the user's question as `--query` so +the backend returns only the relevant sources, each with a `relevanceReason`. The output tells +you when sources were omitted, and when filtering was unavailable (the full list is returned +instead — fail-open). Omit `--query` only when the user asks for the complete inventory. ### 2. Semantic search — your default discovery tool ```bash @@ -64,7 +67,7 @@ Use after `search.py` or `fetch.py` to expand a call graph, inheritance, or symb Standard loop, in order: -1. **`datasources.py`** — every session, no exceptions. +1. **`datasources.py --query ""`** — every session, no exceptions. The relevance-filtered shortlist tells you what to search against; if a source you expected is missing, rerun without `--query` to see the full list. 2. **`search.py`** with the main concept — every session, no exceptions. Run it even when you have a guess; the search confirms or refutes it with real evidence. 3. **`grep.py`** for specific identifiers, error messages, or config keys surfaced in step 2. 4. **`fetch.py`** on the most relevant identifiers (descriptions are triage pointers only — never reason from them). diff --git a/skills/codealive-context-engine/SKILL.md b/skills/codealive-context-engine/SKILL.md index 466833d..0f3c9d7 100644 --- a/skills/codealive-context-engine/SKILL.md +++ b/skills/codealive-context-engine/SKILL.md @@ -37,7 +37,7 @@ Do NOT retry the failed script until setup completes successfully. | Tool | Script | Speed | Cost | Best For | |------|--------|-------|------|----------| -| **List Data Sources** | `datasources.py` | Instant | Free | Discovering indexed repos and workspaces | +| **List Data Sources** | `datasources.py` | Instant | Free | Discovering indexed repos and workspaces. With `--query "task"`, runs an AI relevance filter (low cost, not instant) returning only the relevant sources | | **Semantic Search** | `search.py` | Fast | Low | Default discovery — finds code by meaning (concepts, behavior, architecture) | | **Grep Search** | `grep.py` | Fast | Low | Finds code containing a specific string or regex (identifiers, literals, patterns) | | **Fetch Artifacts** | `fetch.py` | Fast | Low | Retrieving full content; function-like artifacts also include up to 3 outgoing/incoming calls as a preview | @@ -106,9 +106,13 @@ logic. ### 1. Discover what's indexed ```bash -python scripts/datasources.py +python scripts/datasources.py --query "the user's task in natural language" ``` +Recommended: pass the user's task as `--query` so the backend returns only the relevant +data sources, each with a `relevanceReason`. Omit `--query` to list everything (instant, +no AI filtering). + ### 2. Search for code (fast, cheap) ```bash @@ -151,11 +155,21 @@ python scripts/chat.py "What about security considerations?" --continue CONV_ID ### `datasources.py` — List Data Sources ```bash -python scripts/datasources.py # Ready-to-use sources +python scripts/datasources.py --query "add OAuth to checkout" # Only sources relevant to a task (recommended) +python scripts/datasources.py # Ready-to-use sources (full list) python scripts/datasources.py --all # All (including processing) python scripts/datasources.py --json # JSON output ``` +| Option | Description | +|--------|-------------| +| `--query "TASK"` | The user's task/intent in natural language. The backend runs an AI relevance filter and returns only the relevant sources, each with a `relevanceReason`. Recommended whenever you know what the user is trying to accomplish | +| `--all` | Include sources still processing | +| `--json` | Raw JSON output (with `--query`: `{"dataSources": [...], "message": "..."}`) | + +**Fail-open:** if relevance filtering is unavailable, the FULL list is returned and the +output says so — check the message before treating the result as a relevant shortlist. + ### `search.py` — Semantic Code Search (default discovery tool) The default starting point. Finds code by WHAT it does — concepts, behavior, diff --git a/skills/codealive-context-engine/references/workflows.md b/skills/codealive-context-engine/references/workflows.md index 789d5f9..d85a4b3 100644 --- a/skills/codealive-context-engine/references/workflows.md +++ b/skills/codealive-context-engine/references/workflows.md @@ -20,9 +20,13 @@ Complete workflows for common code exploration scenarios using CodeAlive. ### Step 1: Discover Available Code ```bash -python datasources.py +python datasources.py --query "your task in natural language" ``` +Pass your task as `--query` to get only the relevant data sources, each with a +`relevanceReason` (recommended when you know the goal). Run plain `python datasources.py` +for the complete inventory. + Review output to understand: - What repositories are indexed - What workspaces group related repos @@ -287,8 +291,8 @@ python grep.py "useMemo|useCallback|React.memo" workspace:all-frontend --regex ### Day 1: Get Overview ```bash -# Discover what's indexed -python datasources.py +# Discover what's indexed (relevance-filtered to the onboarding goal) +python datasources.py --query "onboard to the new-service codebase" # Find entry points and main features python search.py "main application entry point, startup initialization" new-service diff --git a/skills/codealive-context-engine/scripts/datasources.py b/skills/codealive-context-engine/scripts/datasources.py index 477cc1f..91413be 100755 --- a/skills/codealive-context-engine/scripts/datasources.py +++ b/skills/codealive-context-engine/scripts/datasources.py @@ -6,11 +6,16 @@ Includes current project repos, dependencies, libraries, and organizational codebases. Usage: - python datasources.py # Show ready-to-use data sources - python datasources.py --all # Show all data sources (including processing) - python datasources.py --json # Output as JSON + python datasources.py # Show ready-to-use data sources + python datasources.py --query "TASK" # Show only sources relevant to a task (recommended) + python datasources.py --all # Show all data sources (including processing) + python datasources.py --json # Output as JSON Examples: + # RECOMMENDED when you know the task: only sources relevant to it, each with a + # relevanceReason explaining the match + python datasources.py --query "add OAuth to the checkout flow" + # List ready data sources python datasources.py @@ -19,6 +24,10 @@ # Get JSON output for parsing python datasources.py --json + +Note: + --query runs an AI relevance filter on the backend. It fails open: if filtering is + unavailable, the FULL list is returned and the output says so. """ import sys @@ -31,17 +40,27 @@ from api_client import CodeAliveClient -def format_datasources(datasources: list, as_json: bool = False) -> str: - """Format data sources for display.""" +def format_datasources(datasources: list, as_json: bool = False, message: str = "") -> str: + """Format data sources for display. + + `message` is the relevance hint accompanying a --query'd listing: how many sources + were omitted as non-relevant, or that filtering was unavailable and the list is full. + """ if as_json: + if message: + return json.dumps({"dataSources": datasources, "message": message}, indent=2) return json.dumps(datasources, indent=2) if not datasources: + if message: + return f"No data sources matched.\nℹ️ {message}" return "No data sources found.\nAdd repositories at https://app.codealive.ai" output = [] output.append(f"\n📚 Available Data Sources ({len(datasources)} total)\n") output.append("="*80) + if message: + output.append(f"\nℹ️ {message}") # Group by type repos = [ds for ds in datasources if ds.get("type") == "Repository"] @@ -58,6 +77,8 @@ def format_datasources(datasources: list, as_json: bool = False) -> str: status = f" [{state}]" if state and state != "Alive" else "" output.append(f"\n 📁 {name}{status}") output.append(f" {desc}") + if ws.get("relevanceReason"): + output.append(f" 🎯 {ws['relevanceReason']}") if repos: output.append("\n\n📦 REPOSITORIES") @@ -71,6 +92,8 @@ def format_datasources(datasources: list, as_json: bool = False) -> str: status = f" [{state}]" if state and state != "Alive" else "" output.append(f"\n 📄 {name}{status}") output.append(f" {desc}") + if repo.get("relevanceReason"): + output.append(f" 🎯 {repo['relevanceReason']}") if url: output.append(f" 🔗 {url}") @@ -79,6 +102,7 @@ def format_datasources(datasources: list, as_json: bool = False) -> str: output.append(" • Use names with search.py, grep.py, and fetch.py") output.append(" • Workspaces search ALL repos in the workspace") output.append(" • Combine multiple data sources for broader search") + output.append(" • Pass --query 'your task' to list only the relevant sources") output.append("\n📖 Examples:") output.append(" python search.py 'auth logic' my-backend") output.append(" python grep.py 'AuthService' my-backend") @@ -90,20 +114,37 @@ def main(): """CLI interface for listing data sources.""" alive_only = True as_json = False + query = None - for arg in sys.argv[1:]: + args = sys.argv[1:] + i = 0 + while i < len(args): + arg = args[i] if arg == "--all": alive_only = False elif arg == "--json": as_json = True + elif arg == "--query": + if i + 1 >= len(args): + print("❌ Error: --query requires a value", file=sys.stderr) + sys.exit(1) + query = args[i + 1] + i += 1 elif arg == "--help": print(__doc__) sys.exit(0) + i += 1 try: client = CodeAliveClient() - datasources = client.get_datasources(alive_only=alive_only) - print(format_datasources(datasources, as_json)) + result = client.get_datasources(alive_only=alive_only, query=query) + if isinstance(result, dict): + datasources = result.get("dataSources", []) + message = result.get("message", "") + else: + datasources = result + message = "" + print(format_datasources(datasources, as_json, message)) except Exception as e: print(f"❌ Error: {e}", file=sys.stderr) diff --git a/skills/codealive-context-engine/scripts/lib/api_client.py b/skills/codealive-context-engine/scripts/lib/api_client.py index 94d83ab..b7c4df0 100644 --- a/skills/codealive-context-engine/scripts/lib/api_client.py +++ b/skills/codealive-context-engine/scripts/lib/api_client.py @@ -19,6 +19,45 @@ # agents get an actionable error before the network round-trip. _OBJECT_ID_RE = re.compile(r"^[0-9a-fA-F]{24}$") +# Pre-filter scoped candidate count, emitted by the backend only on relevance-filtered +# (query'd) data source listings. +_TOTAL_DATA_SOURCES_HEADER = "X-CodeAlive-Total-Data-Sources" + + +def relevance_message(datasources: List[Dict[str, Any]], total_header: Optional[str]) -> str: + """Build the hint accompanying a query'd (relevance-filtered) data source listing. + + The backend guarantees every relevance-selected item carries a non-empty + ``relevanceReason``, so a query'd response where NO item has one means the filter + did not run (fail-open on error, disabled by config, or an older backend ignoring + ``query``) and the FULL list was returned — the caller must be told, instead of + mistaking the full dump for a relevant shortlist. + """ + filtered = any(ds.get("relevanceReason") for ds in datasources) + if not filtered: + return ( + "Relevance filtering was unavailable for this request (it may have failed or be " + "disabled), so the FULL unfiltered list of data sources is returned." + ) + + shown = len(datasources) + try: + total = int(total_header) + except (TypeError, ValueError): + # Header absent (TypeError on int(None)) or malformed (ValueError). + total = None + if total is not None and total > shown: + return ( + f"{shown} of {total} available data sources are relevant to this query; the other " + f"{total - shown} were omitted. List without a query to get the full list." + ) + if total is not None: + return f"All {total} available data sources are relevant to this query." + return ( + "Only the data sources relevant to this query are shown; non-relevant sources were " + "omitted. List without a query to get the full list." + ) + def format_codealive_error(status: int, body: Any) -> str: """Format a CodeAlive REST API error body into a single human/agent-readable line. @@ -274,8 +313,9 @@ def _make_request( method: str, endpoint: str, params: Optional[Dict[str, Any]] = None, - body: Optional[Dict[str, Any]] = None - ) -> Dict[str, Any]: + body: Optional[Dict[str, Any]] = None, + return_headers: bool = False + ) -> Any: """ Make an HTTP request to the CodeAlive API. @@ -284,9 +324,10 @@ def _make_request( endpoint: API endpoint path params: URL query parameters body: Request body for POST requests + return_headers: If True, return (parsed JSON, response headers dict) instead. Returns: - Parsed JSON response + Parsed JSON response, or (parsed JSON, headers) when return_headers is True """ url = f"{self.base_url}{endpoint}" @@ -312,7 +353,10 @@ def _make_request( try: with urllib.request.urlopen(request, timeout=self.timeout) as response: response_data = response.read().decode("utf-8") - return json.loads(response_data) if response_data else {} + parsed = json.loads(response_data) if response_data else {} + if return_headers: + return parsed, dict(response.headers.items()) + return parsed except urllib.error.HTTPError as e: error_body = e.read() error_msg = format_codealive_error(e.code, error_body) @@ -353,18 +397,35 @@ def _make_request( f"Check your network connection and CODEALIVE_BASE_URL setting." ) - def get_datasources(self, alive_only: bool = True) -> List[Dict[str, Any]]: + def get_datasources( + self, alive_only: bool = True, query: Optional[str] = None + ) -> Any: """ Get available data sources (repositories and workspaces). Args: alive_only: If True, only return data sources ready for use. If False, return all. + query: Optional natural-language task/intent (e.g. "add OAuth to checkout"). When + provided, the backend runs an agentic relevance filter and returns ONLY the data + sources relevant to that intent, each with a `relevanceReason` explaining why. Returns: - List of data source objects with id, name, description, type, etc. + Without query: list of data source objects with id, name, description, type, etc. + With query: dict {"dataSources": [...], "message": "..."} where `message` says whether + sources were omitted as non-relevant (and how many of the total) or that relevance + filtering was unavailable and the FULL list is returned. """ endpoint = "/api/datasources/ready" if alive_only else "/api/datasources/all" - return self._make_request("GET", endpoint) + if not query or not query.strip(): + return self._make_request("GET", endpoint) + + datasources, headers = self._make_request( + "GET", endpoint, params={"query": query}, return_headers=True + ) + return { + "dataSources": datasources, + "message": relevance_message(datasources, headers.get(_TOTAL_DATA_SOURCES_HEADER)), + } def search( self, @@ -581,7 +642,7 @@ def main(): if len(sys.argv) < 2: print("Usage: python api_client.py [args...]") print("Commands:") - print(" datasources [--all]") + print(" datasources [--all] [--query TASK]") print(" search [data_source2...] [--mode auto|fast|deep] [--description-detail short|full]") print(" semantic-search [data_source2...] [--path PATH] [--ext EXT] [--max-results N]") print(" grep-search [data_source2...] [--regex] [--path PATH] [--ext EXT] [--max-results N]") @@ -596,7 +657,14 @@ def main(): try: if command == "datasources": alive_only = "--all" not in sys.argv - result = client.get_datasources(alive_only=alive_only) + query = None + if "--query" in sys.argv: + query_index = sys.argv.index("--query") + if query_index + 1 >= len(sys.argv): + print("Usage: datasources [--all] [--query TASK]") + sys.exit(1) + query = sys.argv[query_index + 1] + result = client.get_datasources(alive_only=alive_only, query=query) print(json.dumps(result, indent=2)) elif command == "search": diff --git a/tests/test_cli_smoke.py b/tests/test_cli_smoke.py index bf38eeb..5625c16 100644 --- a/tests/test_cli_smoke.py +++ b/tests/test_cli_smoke.py @@ -120,6 +120,71 @@ def chat_handler(_request): ] +def test_datasources_script_query_flag_renders_relevance_shortlist(): + def datasources_handler(_request): + return 200, [ + { + "id": "repo-1", + "name": "backend", + "type": "Repository", + "description": "Main backend", + "relevanceReason": "Implements the checkout flow", + } + ], {"X-CodeAlive-Total-Data-Sources": "3"} + + with mock_codealive_server( + {("GET", "/api/datasources/ready?query=add+OAuth+to+checkout"): datasources_handler} + ) as (base_url, requests): + env = { + **os.environ, + "CODEALIVE_API_KEY": "skill-test-key", + "CODEALIVE_BASE_URL": f"{base_url}/api", + } + + formatted = _run("datasources.py", "--query", "add OAuth to checkout", env=env) + as_json = _run("datasources.py", "--query", "add OAuth to checkout", "--json", env=env) + + assert formatted.returncode == 0, formatted.stderr + assert "backend" in formatted.stdout + assert "Implements the checkout flow" in formatted.stdout + assert "1 of 3 available data sources are relevant" in formatted.stdout + assert "the other 2 were omitted" in formatted.stdout + + assert as_json.returncode == 0, as_json.stderr + envelope = json.loads(as_json.stdout) + assert envelope["dataSources"][0]["relevanceReason"] == "Implements the checkout flow" + assert "1 of 3" in envelope["message"] + + assert [request["path"] for request in requests] == [ + "/api/datasources/ready?query=add+OAuth+to+checkout", + "/api/datasources/ready?query=add+OAuth+to+checkout", + ] + + +def test_datasources_script_query_fail_open_warns_full_list(): + with mock_codealive_server( + { + ("GET", "/api/datasources/ready?query=add+OAuth"): ( + 200, + [ + {"id": "repo-1", "name": "backend", "type": "Repository"}, + {"id": "repo-2", "name": "frontend", "type": "Repository"}, + ], + ) + } + ) as (base_url, _requests): + env = { + **os.environ, + "CODEALIVE_API_KEY": "skill-test-key", + "CODEALIVE_BASE_URL": f"{base_url}/api", + } + + result = _run("datasources.py", "--query", "add OAuth", env=env) + + assert result.returncode == 0, result.stderr + assert "FULL unfiltered list" in result.stdout + + def test_relationships_script_works_against_mock_backend(): def relationships_handler(request): body = json.loads(request["body"]) diff --git a/tests/test_setup_and_client.py b/tests/test_setup_and_client.py index fc745ef..1e0cf32 100644 --- a/tests/test_setup_and_client.py +++ b/tests/test_setup_and_client.py @@ -72,6 +72,66 @@ def test_api_client_normalizes_base_url_and_calls_ready_endpoint(): assert requests[0]["headers"]["Authorization"] == "Bearer skill-test-key" +def test_get_datasources_with_query_sends_param_and_reports_omitted_count(): + def datasources_handler(_request): + return 200, [ + { + "id": "repo-1", + "name": "backend", + "type": "Repository", + "relevanceReason": "Implements the checkout flow", + } + ], {"X-CodeAlive-Total-Data-Sources": "3"} + + with mock_codealive_server( + {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler} + ) as (base_url, requests): + client = CodeAliveClient(api_key="skill-test-key", base_url=base_url) + result = client.get_datasources(query="add OAuth") + + assert requests[0]["path"] == "/api/datasources/ready?query=add+OAuth" + assert result["dataSources"][0]["relevanceReason"] == "Implements the checkout flow" + assert "1 of 3 available data sources are relevant" in result["message"] + assert "the other 2 were omitted" in result["message"] + + +def test_get_datasources_query_fail_open_warns_full_list_returned(): + # No item carries relevanceReason and no total header: the backend filter did not + # run (fail-open / disabled / older backend) and returned the full list. + with mock_codealive_server( + { + ("GET", "/api/datasources/ready?query=add+OAuth"): ( + 200, + [ + {"id": "repo-1", "name": "backend", "type": "Repository"}, + {"id": "repo-2", "name": "frontend", "type": "Repository"}, + ], + ) + } + ) as (base_url, _requests): + client = CodeAliveClient(api_key="skill-test-key", base_url=base_url) + result = client.get_datasources(query="add OAuth") + + assert len(result["dataSources"]) == 2 + assert "FULL unfiltered list" in result["message"] + + +def test_get_datasources_blank_query_behaves_like_no_query(): + with mock_codealive_server( + { + ("GET", "/api/datasources/ready"): ( + 200, + [{"id": "repo-1", "name": "backend", "type": "Repository"}], + ) + } + ) as (base_url, requests): + client = CodeAliveClient(api_key="skill-test-key", base_url=base_url) + result = client.get_datasources(query=" ") + + assert result == [{"id": "repo-1", "name": "backend", "type": "Repository"}] + assert requests[0]["path"] == "/api/datasources/ready" + + def test_api_client_search_fetch_and_chat_use_expected_endpoints(): def search_handler(request): assert "Query=auth" in request["path"] From 1d549833a1b171c966a307279e68fdad1ab9dcd6 Mon Sep 17 00:00:00 2001 From: sciapanCA Date: Tue, 9 Jun 2026 21:30:48 +0200 Subject: [PATCH 2/2] Handle confident-empty relevance verdicts and header casing drift --- .../scripts/lib/api_client.py | 47 +++++++++++++------ tests/test_setup_and_client.py | 35 +++++++++++++- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/skills/codealive-context-engine/scripts/lib/api_client.py b/skills/codealive-context-engine/scripts/lib/api_client.py index b7c4df0..705481b 100644 --- a/skills/codealive-context-engine/scripts/lib/api_client.py +++ b/skills/codealive-context-engine/scripts/lib/api_client.py @@ -20,32 +20,49 @@ _OBJECT_ID_RE = re.compile(r"^[0-9a-fA-F]{24}$") # Pre-filter scoped candidate count, emitted by the backend only on relevance-filtered -# (query'd) data source listings. -_TOTAL_DATA_SOURCES_HEADER = "X-CodeAlive-Total-Data-Sources" +# (query'd) data source listings. Lowercase because _make_request lowercases header +# keys (proxies/origins may change response-header casing; HTTP headers are +# case-insensitive per RFC 9110). +_TOTAL_DATA_SOURCES_HEADER = "x-codealive-total-data-sources" def relevance_message(datasources: List[Dict[str, Any]], total_header: Optional[str]) -> str: """Build the hint accompanying a query'd (relevance-filtered) data source listing. The backend guarantees every relevance-selected item carries a non-empty - ``relevanceReason``, so a query'd response where NO item has one means the filter - did not run (fail-open on error, disabled by config, or an older backend ignoring - ``query``) and the FULL list was returned — the caller must be told, instead of - mistaking the full dump for a relevant shortlist. - """ - filtered = any(ds.get("relevanceReason") for ds in datasources) - if not filtered: - return ( - "Relevance filtering was unavailable for this request (it may have failed or be " - "disabled), so the FULL unfiltered list of data sources is returned." - ) + ``relevanceReason``, so a NON-EMPTY query'd response where no item has one means + the filter did not run (fail-open on error, disabled by config, or an older + backend ignoring ``query``) and the FULL list was returned — the caller must be + told, instead of mistaking the full dump for a relevant shortlist. + An EMPTY response is never fail-open output when the total header reports + available candidates (fail-open returns the full, hence non-empty, list): it is + the filter's confident-empty verdict — it ran and matched nothing. + + The total header is NOT a filter-success signal: the backend emits it on every + query'd response, including fail-open. + """ shown = len(datasources) try: total = int(total_header) except (TypeError, ValueError): # Header absent (TypeError on int(None)) or malformed (ValueError). total = None + + if shown == 0: + if total is not None and total > 0: + return ( + f"None of the {total} available data sources are relevant to this query. " + "List without a query to get the full list." + ) + return "No data sources are available." + + filtered = any(ds.get("relevanceReason") for ds in datasources) + if not filtered: + return ( + "Relevance filtering was unavailable for this request (it may have failed or be " + "disabled), so the FULL unfiltered list of data sources is returned." + ) if total is not None and total > shown: return ( f"{shown} of {total} available data sources are relevant to this query; the other " @@ -355,7 +372,9 @@ def _make_request( response_data = response.read().decode("utf-8") parsed = json.loads(response_data) if response_data else {} if return_headers: - return parsed, dict(response.headers.items()) + # Lowercase keys: header casing is not guaranteed end-to-end + # (RFC 9110 §5.1), and a plain dict lookup is case-sensitive. + return parsed, {k.lower(): v for k, v in response.headers.items()} return parsed except urllib.error.HTTPError as e: error_body = e.read() diff --git a/tests/test_setup_and_client.py b/tests/test_setup_and_client.py index 1e0cf32..a11c1b3 100644 --- a/tests/test_setup_and_client.py +++ b/tests/test_setup_and_client.py @@ -81,7 +81,7 @@ def datasources_handler(_request): "type": "Repository", "relevanceReason": "Implements the checkout flow", } - ], {"X-CodeAlive-Total-Data-Sources": "3"} + ], {"x-codealive-total-data-sources": "3"} # lowercase: proxies may normalize casing with mock_codealive_server( {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler} @@ -116,6 +116,39 @@ def test_get_datasources_query_fail_open_warns_full_list_returned(): assert "FULL unfiltered list" in result["message"] +def test_get_datasources_query_confident_empty_reports_nothing_relevant(): + # Empty list + total header: the filter ran and confidently matched nothing. + # Must NOT be mistaken for fail-open (fail-open returns the full, non-empty list). + def datasources_handler(_request): + return 200, [], {"X-CodeAlive-Total-Data-Sources": "3"} + + with mock_codealive_server( + {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler} + ) as (base_url, _requests): + client = CodeAliveClient(api_key="skill-test-key", base_url=base_url) + result = client.get_datasources(query="add OAuth") + + assert result["dataSources"] == [] + assert "None of the 3 available data sources are relevant" in result["message"] + assert "List without a query" in result["message"] + + +def test_get_datasources_query_empty_org_reports_no_sources(): + # Empty list and total header reports zero candidates: the org simply has no + # data sources — not a relevance verdict, not a filter failure. + def datasources_handler(_request): + return 200, [], {"X-CodeAlive-Total-Data-Sources": "0"} + + with mock_codealive_server( + {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler} + ) as (base_url, _requests): + client = CodeAliveClient(api_key="skill-test-key", base_url=base_url) + result = client.get_datasources(query="add OAuth") + + assert result["dataSources"] == [] + assert result["message"] == "No data sources are available." + + def test_get_datasources_blank_query_behaves_like_no_query(): with mock_codealive_server( {