From 6f4511f302665db1f595bc7b91724361689f338d Mon Sep 17 00:00:00 2001
From: sciapanCA <sciapan@codealive.dev>
Date: Tue, 9 Jun 2026 21:07:39 +0200
Subject: [PATCH 1/2] Add datasource relevance filter support (--query)

Mirror the backend/MCP datasource relevance filter in the skill:
- get_datasources() accepts an optional natural-language query, sends it
  as ?query=, parses the X-CodeAlive-Total-Data-Sources header, and (for
  query'd calls only) returns a {dataSources, message} envelope with
  fail-open detection; without a query it still returns the bare list
- datasources.py gains a --query flag, renders relevanceReason per
  source and the omitted-count / fail-open message
- SKILL.md, workflows reference, and the context-explorer agent now
  recommend passing the user's task as --query
- Bump plugin version to 2.1.0

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .claude-plugin/plugin.json                    |  2 +-
 agents/codealive-context-explorer.md          | 11 ++-
 skills/codealive-context-engine/SKILL.md      | 20 ++++-
 .../references/workflows.md                   | 10 ++-
 .../scripts/datasources.py                    | 57 ++++++++++--
 .../scripts/lib/api_client.py                 | 86 +++++++++++++++++--
 tests/test_cli_smoke.py                       | 65 ++++++++++++++
 tests/test_setup_and_client.py                | 60 +++++++++++++
 8 files changed, 283 insertions(+), 28 deletions(-)
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 4fe8224..1a7cdb5 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "codealive",
   "description": "CodeAlive context engine for semantic code search and AI-powered codebase Q&A. Enables AI coding agents to understand entire codebases beyond just open files — search across all indexed repositories, trace cross-service dependencies, discover usage patterns, and get synthesized answers to architectural questions. Includes a lightweight code exploration subagent, authentication hooks, and multiple search modes (fast lexical, semantic, and deep cross-cutting). Works standalone or alongside the CodeAlive MCP server for direct tool access via the Model Context Protocol.",
-  "version": "2.0.9",
+  "version": "2.1.0",
   "author": {
     "name": "CodeAlive AI",
     "email": "hello@codealive.ai"
diff --git a/agents/codealive-context-explorer.md b/agents/codealive-context-explorer.md
index a0a9740..f338ebe 100644
--- a/agents/codealive-context-explorer.md
+++ b/agents/codealive-context-explorer.md
@@ -16,7 +16,7 @@ You are a code exploration specialist. **Your default tool is CodeAlive — not
 Unless the request is unambiguously a local-only file lookup ("read line 42 of foo.ts", "is bar.py in this repo"), your first turn MUST include both of these calls before any answer:
 
 ```bash
-python scripts/datasources.py
+python scripts/datasources.py --query "<the user's question or task>"
 python scripts/search.py "<question paraphrased as a concept query>" <data_source>
 ```
 
@@ -28,9 +28,12 @@ The scripts directory is relative to the skill location. If a path fails, fall b
 
 ### 1. List data sources — run FIRST every session
 ```bash
-python scripts/datasources.py
+python scripts/datasources.py --query "<the user's question or task>"
 ```
-Without this you do not know what to search against. Instant, free, cheap.
+Without this you do not know what to search against. Pass the user's question as `--query` so
+the backend returns only the relevant sources, each with a `relevanceReason`. The output tells
+you when sources were omitted, and when filtering was unavailable (the full list is returned
+instead — fail-open). Omit `--query` only when the user asks for the complete inventory.
 
 ### 2. Semantic search — your default discovery tool
 ```bash
@@ -64,7 +67,7 @@ Use after `search.py` or `fetch.py` to expand a call graph, inheritance, or symb
 
 Standard loop, in order:
 
-1. **`datasources.py`** — every session, no exceptions.
+1. **`datasources.py --query "<user's task>"`** — every session, no exceptions. The relevance-filtered shortlist tells you what to search against; if a source you expected is missing, rerun without `--query` to see the full list.
 2. **`search.py`** with the main concept — every session, no exceptions. Run it even when you have a guess; the search confirms or refutes it with real evidence.
 3. **`grep.py`** for specific identifiers, error messages, or config keys surfaced in step 2.
 4. **`fetch.py`** on the most relevant identifiers (descriptions are triage pointers only — never reason from them).
diff --git a/skills/codealive-context-engine/SKILL.md b/skills/codealive-context-engine/SKILL.md
index 466833d..0f3c9d7 100644
--- a/skills/codealive-context-engine/SKILL.md
+++ b/skills/codealive-context-engine/SKILL.md
@@ -37,7 +37,7 @@ Do NOT retry the failed script until setup completes successfully.
 
 | Tool | Script | Speed | Cost | Best For |
 |------|--------|-------|------|----------|
-| **List Data Sources** | `datasources.py` | Instant | Free | Discovering indexed repos and workspaces |
+| **List Data Sources** | `datasources.py` | Instant | Free | Discovering indexed repos and workspaces. With `--query "task"`, runs an AI relevance filter (low cost, not instant) returning only the relevant sources |
 | **Semantic Search** | `search.py` | Fast | Low | Default discovery — finds code by meaning (concepts, behavior, architecture) |
 | **Grep Search** | `grep.py` | Fast | Low | Finds code containing a specific string or regex (identifiers, literals, patterns) |
 | **Fetch Artifacts** | `fetch.py` | Fast | Low | Retrieving full content; function-like artifacts also include up to 3 outgoing/incoming calls as a preview |
@@ -106,9 +106,13 @@ logic.
 ### 1. Discover what's indexed
 
 ```bash
-python scripts/datasources.py
+python scripts/datasources.py --query "the user's task in natural language"
 ```
 
+Recommended: pass the user's task as `--query` so the backend returns only the relevant
+data sources, each with a `relevanceReason`. Omit `--query` to list everything (instant,
+no AI filtering).
+
 ### 2. Search for code (fast, cheap)
 
 ```bash
@@ -151,11 +155,21 @@ python scripts/chat.py "What about security considerations?" --continue CONV_ID
 ### `datasources.py` — List Data Sources
 
 ```bash
-python scripts/datasources.py              # Ready-to-use sources
+python scripts/datasources.py --query "add OAuth to checkout"  # Only sources relevant to a task (recommended)
+python scripts/datasources.py              # Ready-to-use sources (full list)
 python scripts/datasources.py --all        # All (including processing)
 python scripts/datasources.py --json       # JSON output
 ```
 
+| Option | Description |
+|--------|-------------|
+| `--query "TASK"` | The user's task/intent in natural language. The backend runs an AI relevance filter and returns only the relevant sources, each with a `relevanceReason`. Recommended whenever you know what the user is trying to accomplish |
+| `--all` | Include sources still processing |
+| `--json` | Raw JSON output: a bare array of sources, or with `--query` an envelope `{"dataSources": [...], "message": "..."}` |
+
+**Fail-open:** if relevance filtering is unavailable, the FULL list is returned and the
+output says so — check the message before treating the result as a relevant shortlist.
+
 ### `search.py` — Semantic Code Search (default discovery tool)
 
 The default starting point. Finds code by WHAT it does — concepts, behavior,
diff --git a/skills/codealive-context-engine/references/workflows.md b/skills/codealive-context-engine/references/workflows.md
index 789d5f9..d85a4b3 100644
--- a/skills/codealive-context-engine/references/workflows.md
+++ b/skills/codealive-context-engine/references/workflows.md
@@ -20,9 +20,13 @@ Complete workflows for common code exploration scenarios using CodeAlive.
 
 ### Step 1: Discover Available Code
 ```bash
-python datasources.py
+python datasources.py --query "your task in natural language"
 ```
 
+Pass your task as `--query` to get only the relevant data sources, each with a
+`relevanceReason` (recommended when you know the goal). Run plain `python datasources.py`
+for the complete inventory.
+
 Review output to understand:
 - What repositories are indexed
 - What workspaces group related repos
@@ -287,8 +291,8 @@ python grep.py "useMemo|useCallback|React.memo" workspace:all-frontend --regex
 ### Day 1: Get Overview
 
 ```bash
-# Discover what's indexed
-python datasources.py
+# Discover what's indexed (relevance-filtered to the onboarding goal)
+python datasources.py --query "onboard to the new-service codebase"
 
 # Find entry points and main features
 python search.py "main application entry point, startup initialization" new-service
diff --git a/skills/codealive-context-engine/scripts/datasources.py b/skills/codealive-context-engine/scripts/datasources.py
index 477cc1f..91413be 100755
--- a/skills/codealive-context-engine/scripts/datasources.py
+++ b/skills/codealive-context-engine/scripts/datasources.py
@@ -6,11 +6,16 @@
 Includes current project repos, dependencies, libraries, and organizational codebases.
 
 Usage:
-    python datasources.py              # Show ready-to-use data sources
-    python datasources.py --all        # Show all data sources (including processing)
-    python datasources.py --json       # Output as JSON
+    python datasources.py                  # Show ready-to-use data sources
+    python datasources.py --query "TASK"   # Show only sources relevant to a task (recommended)
+    python datasources.py --all            # Show all data sources (including processing)
+    python datasources.py --json           # Output as JSON
 
 Examples:
+    # RECOMMENDED when you know the task: only sources relevant to it, each with a
+    # relevanceReason explaining the match
+    python datasources.py --query "add OAuth to the checkout flow"
+
     # List ready data sources
     python datasources.py
 
@@ -19,6 +24,10 @@
 
     # Get JSON output for parsing
     python datasources.py --json
+
+Note:
+    --query runs an AI relevance filter on the backend. It fails open: if filtering is
+    unavailable, the FULL list is returned and the output says so.
 """
 
 import sys
@@ -31,17 +40,27 @@
 from api_client import CodeAliveClient
 
 
-def format_datasources(datasources: list, as_json: bool = False) -> str:
-    """Format data sources for display."""
+def format_datasources(datasources: list, as_json: bool = False, message: str = "") -> str:
+    """Format data sources for display.
+
+    `message` is the relevance hint accompanying a --query'd listing: how many sources were
+    omitted as non-relevant, or that filtering was unavailable so the full list is shown.
+    """
     if as_json:
+        if message:
+            return json.dumps({"dataSources": datasources, "message": message}, indent=2)
         return json.dumps(datasources, indent=2)
 
     if not datasources:
+        if message:
+            return f"No data sources matched.\nℹ️  {message}"
         return "No data sources found.\nAdd repositories at https://app.codealive.ai"
 
     output = []
     output.append(f"\n📚 Available Data Sources ({len(datasources)} total)\n")
     output.append("="*80)
+    if message:
+        output.append(f"\nℹ️  {message}")
 
     # Group by type
     repos = [ds for ds in datasources if ds.get("type") == "Repository"]
@@ -58,6 +77,8 @@ def format_datasources(datasources: list, as_json: bool = False) -> str:
             status = f" [{state}]" if state and state != "Alive" else ""
             output.append(f"\n  📁 {name}{status}")
             output.append(f"     {desc}")
+            if ws.get("relevanceReason"):
+                output.append(f"     🎯 {ws['relevanceReason']}")
 
     if repos:
         output.append("\n\n📦 REPOSITORIES")
@@ -71,6 +92,8 @@ def format_datasources(datasources: list, as_json: bool = False) -> str:
             status = f" [{state}]" if state and state != "Alive" else ""
             output.append(f"\n  📄 {name}{status}")
             output.append(f"     {desc}")
+            if repo.get("relevanceReason"):
+                output.append(f"     🎯 {repo['relevanceReason']}")
             if url:
                 output.append(f"     🔗 {url}")
 
@@ -79,6 +102,7 @@ def format_datasources(datasources: list, as_json: bool = False) -> str:
     output.append("   • Use names with search.py, grep.py, and fetch.py")
     output.append("   • Workspaces search ALL repos in the workspace")
     output.append("   • Combine multiple data sources for broader search")
+    output.append("   • Pass --query 'your task' to list only the relevant sources")
     output.append("\n📖 Examples:")
     output.append("   python search.py 'auth logic' my-backend")
     output.append("   python grep.py 'AuthService' my-backend")
@@ -90,20 +114,37 @@ def main():
     """CLI interface for listing data sources."""
     alive_only = True
     as_json = False
+    query = None
 
-    for arg in sys.argv[1:]:
+    args = sys.argv[1:]
+    i = 0
+    while i < len(args):
+        arg = args[i]
         if arg == "--all":
             alive_only = False
         elif arg == "--json":
             as_json = True
+        elif arg == "--query":
+            if i + 1 >= len(args):
+                print("❌ Error: --query requires a value", file=sys.stderr)
+                sys.exit(1)
+            query = args[i + 1]
+            i += 1
         elif arg == "--help":
             print(__doc__)
             sys.exit(0)
+        i += 1
 
     try:
         client = CodeAliveClient()
-        datasources = client.get_datasources(alive_only=alive_only)
-        print(format_datasources(datasources, as_json))
+        result = client.get_datasources(alive_only=alive_only, query=query)
+        if isinstance(result, dict):
+            datasources = result.get("dataSources", [])
+            message = result.get("message", "")
+        else:
+            datasources = result
+            message = ""
+        print(format_datasources(datasources, as_json, message))
 
     except Exception as e:
         print(f"❌ Error: {e}", file=sys.stderr)
diff --git a/skills/codealive-context-engine/scripts/lib/api_client.py b/skills/codealive-context-engine/scripts/lib/api_client.py
index 94d83ab..b7c4df0 100644
--- a/skills/codealive-context-engine/scripts/lib/api_client.py
+++ b/skills/codealive-context-engine/scripts/lib/api_client.py
@@ -19,6 +19,45 @@
 # agents get an actionable error before the network round-trip.
 _OBJECT_ID_RE = re.compile(r"^[0-9a-fA-F]{24}$")
 
+# Pre-filter scoped candidate count, emitted by the backend only on relevance-filtered
+# (query'd) data source listings.
+_TOTAL_DATA_SOURCES_HEADER = "X-CodeAlive-Total-Data-Sources"
+
+
+def relevance_message(datasources: List[Dict[str, Any]], total_header: Optional[str]) -> str:
+    """Build the hint accompanying a query'd (relevance-filtered) data source listing.
+
+    The backend guarantees every relevance-selected item carries a non-empty
+    ``relevanceReason``, so a query'd response where NO item has one means the filter
+    did not run (fail-open on error, disabled by config, or an older backend ignoring
+    ``query``) and the FULL list was returned — the caller must be told, instead of
+    mistaking the full dump for a relevant shortlist.
+    """
+    filtered = any(ds.get("relevanceReason") for ds in datasources)
+    if not filtered:
+        return (
+            "Relevance filtering was unavailable for this request (it may have failed or be "
+            "disabled), so the FULL unfiltered list of data sources is returned."
+        )
+
+    shown = len(datasources)
+    try:
+        total = int(total_header)
+    except (TypeError, ValueError):
+        # Header absent (TypeError on int(None)) or malformed (ValueError).
+        total = None
+    if total is not None and total > shown:
+        return (
+            f"{shown} of {total} available data sources are relevant to this query; the other "
+            f"{total - shown} were omitted. List without a query to get the full list."
+        )
+    if total is not None:
+        return f"All {total} available data sources are relevant to this query."
+    return (
+        "Only the data sources relevant to this query are shown; non-relevant sources were "
+        "omitted. List without a query to get the full list."
+    )
+
 
 def format_codealive_error(status: int, body: Any) -> str:
     """Format a CodeAlive REST API error body into a single human/agent-readable line.
@@ -274,8 +313,9 @@ def _make_request(
         method: str,
         endpoint: str,
         params: Optional[Dict[str, Any]] = None,
-        body: Optional[Dict[str, Any]] = None
-    ) -> Dict[str, Any]:
+        body: Optional[Dict[str, Any]] = None,
+        return_headers: bool = False
+    ) -> Any:
         """
         Make an HTTP request to the CodeAlive API.
 
@@ -284,9 +324,10 @@ def _make_request(
             endpoint: API endpoint path
             params: URL query parameters
             body: Request body for POST requests
+            return_headers: If True, return (parsed JSON, response headers dict) instead.
 
         Returns:
-            Parsed JSON response
+            Parsed JSON response, or (parsed JSON, headers) when return_headers is True
         """
         url = f"{self.base_url}{endpoint}"
 
@@ -312,7 +353,10 @@ def _make_request(
         try:
             with urllib.request.urlopen(request, timeout=self.timeout) as response:
                 response_data = response.read().decode("utf-8")
-                return json.loads(response_data) if response_data else {}
+                parsed = json.loads(response_data) if response_data else {}
+                if return_headers:
+                    return parsed, dict(response.headers.items())
+                return parsed
         except urllib.error.HTTPError as e:
             error_body = e.read()
             error_msg = format_codealive_error(e.code, error_body)
@@ -353,18 +397,35 @@ def _make_request(
                 f"Check your network connection and CODEALIVE_BASE_URL setting."
             )
 
-    def get_datasources(self, alive_only: bool = True) -> List[Dict[str, Any]]:
+    def get_datasources(
+        self, alive_only: bool = True, query: Optional[str] = None
+    ) -> Any:
         """
         Get available data sources (repositories and workspaces).
 
         Args:
             alive_only: If True, only return data sources ready for use. If False, return all.
+            query: Optional natural-language task/intent (e.g. "add OAuth to checkout"). When
+                provided, the backend runs an agentic relevance filter and returns ONLY the data
+                sources relevant to that intent, each with a `relevanceReason` explaining why.
 
         Returns:
-            List of data source objects with id, name, description, type, etc.
+            Without query: list of data source objects with id, name, description, type, etc.
+            With query: dict {"dataSources": [...], "message": "..."} where `message` says whether
+            sources were omitted as non-relevant (and how many of the total) or that relevance
+            filtering was unavailable and the FULL list is returned.
         """
         endpoint = "/api/datasources/ready" if alive_only else "/api/datasources/all"
-        return self._make_request("GET", endpoint)
+        if not query or not query.strip():
+            return self._make_request("GET", endpoint)
+
+        datasources, headers = self._make_request(
+            "GET", endpoint, params={"query": query}, return_headers=True
+        )
+        return {
+            "dataSources": datasources,
+            "message": relevance_message(datasources, headers.get(_TOTAL_DATA_SOURCES_HEADER)),
+        }
 
     def search(
         self,
@@ -581,7 +642,7 @@ def main():
     if len(sys.argv) < 2:
         print("Usage: python api_client.py <command> [args...]")
         print("Commands:")
-        print("  datasources [--all]")
+        print("  datasources [--all] [--query TASK]")
         print("  search <query> <data_source1> [data_source2...] [--mode auto|fast|deep] [--description-detail short|full]")
         print("  semantic-search <query> <data_source1> [data_source2...] [--path PATH] [--ext EXT] [--max-results N]")
         print("  grep-search <query> <data_source1> [data_source2...] [--regex] [--path PATH] [--ext EXT] [--max-results N]")
@@ -596,7 +657,14 @@ def main():
     try:
         if command == "datasources":
             alive_only = "--all" not in sys.argv
-            result = client.get_datasources(alive_only=alive_only)
+            query = None
+            if "--query" in sys.argv:
+                query_index = sys.argv.index("--query")
+                if query_index + 1 >= len(sys.argv):
+                    print("Usage: datasources [--all] [--query TASK]")
+                    sys.exit(1)
+                query = sys.argv[query_index + 1]
+            result = client.get_datasources(alive_only=alive_only, query=query)
             print(json.dumps(result, indent=2))
 
         elif command == "search":
diff --git a/tests/test_cli_smoke.py b/tests/test_cli_smoke.py
index bf38eeb..5625c16 100644
--- a/tests/test_cli_smoke.py
+++ b/tests/test_cli_smoke.py
@@ -120,6 +120,71 @@ def chat_handler(_request):
     ]
 
 
+def test_datasources_script_query_flag_renders_relevance_shortlist():
+    def datasources_handler(_request):
+        return 200, [
+            {
+                "id": "repo-1",
+                "name": "backend",
+                "type": "Repository",
+                "description": "Main backend",
+                "relevanceReason": "Implements the checkout flow",
+            }
+        ], {"X-CodeAlive-Total-Data-Sources": "3"}
+
+    with mock_codealive_server(
+        {("GET", "/api/datasources/ready?query=add+OAuth+to+checkout"): datasources_handler}
+    ) as (base_url, requests):
+        env = {
+            **os.environ,
+            "CODEALIVE_API_KEY": "skill-test-key",
+            "CODEALIVE_BASE_URL": f"{base_url}/api",
+        }
+
+        formatted = _run("datasources.py", "--query", "add OAuth to checkout", env=env)
+        as_json = _run("datasources.py", "--query", "add OAuth to checkout", "--json", env=env)
+
+    assert formatted.returncode == 0, formatted.stderr
+    assert "backend" in formatted.stdout
+    assert "Implements the checkout flow" in formatted.stdout
+    assert "1 of 3 available data sources are relevant" in formatted.stdout
+    assert "the other 2 were omitted" in formatted.stdout
+
+    assert as_json.returncode == 0, as_json.stderr
+    envelope = json.loads(as_json.stdout)
+    assert envelope["dataSources"][0]["relevanceReason"] == "Implements the checkout flow"
+    assert "1 of 3" in envelope["message"]
+
+    assert [request["path"] for request in requests] == [
+        "/api/datasources/ready?query=add+OAuth+to+checkout",
+        "/api/datasources/ready?query=add+OAuth+to+checkout",
+    ]
+
+
+def test_datasources_script_query_fail_open_warns_full_list():
+    with mock_codealive_server(
+        {
+            ("GET", "/api/datasources/ready?query=add+OAuth"): (
+                200,
+                [
+                    {"id": "repo-1", "name": "backend", "type": "Repository"},
+                    {"id": "repo-2", "name": "frontend", "type": "Repository"},
+                ],
+            )
+        }
+    ) as (base_url, _requests):
+        env = {
+            **os.environ,
+            "CODEALIVE_API_KEY": "skill-test-key",
+            "CODEALIVE_BASE_URL": f"{base_url}/api",
+        }
+
+        result = _run("datasources.py", "--query", "add OAuth", env=env)
+
+    assert result.returncode == 0, result.stderr
+    assert "FULL unfiltered list" in result.stdout
+
+
 def test_relationships_script_works_against_mock_backend():
     def relationships_handler(request):
         body = json.loads(request["body"])
diff --git a/tests/test_setup_and_client.py b/tests/test_setup_and_client.py
index fc745ef..1e0cf32 100644
--- a/tests/test_setup_and_client.py
+++ b/tests/test_setup_and_client.py
@@ -72,6 +72,66 @@ def test_api_client_normalizes_base_url_and_calls_ready_endpoint():
     assert requests[0]["headers"]["Authorization"] == "Bearer skill-test-key"
 
 
+def test_get_datasources_with_query_sends_param_and_reports_omitted_count():
+    def datasources_handler(_request):
+        return 200, [
+            {
+                "id": "repo-1",
+                "name": "backend",
+                "type": "Repository",
+                "relevanceReason": "Implements the checkout flow",
+            }
+        ], {"X-CodeAlive-Total-Data-Sources": "3"}
+
+    with mock_codealive_server(
+        {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler}
+    ) as (base_url, requests):
+        client = CodeAliveClient(api_key="skill-test-key", base_url=base_url)
+        result = client.get_datasources(query="add OAuth")
+
+    assert requests[0]["path"] == "/api/datasources/ready?query=add+OAuth"
+    assert result["dataSources"][0]["relevanceReason"] == "Implements the checkout flow"
+    assert "1 of 3 available data sources are relevant" in result["message"]
+    assert "the other 2 were omitted" in result["message"]
+
+
+def test_get_datasources_query_fail_open_warns_full_list_returned():
+    # No item carries relevanceReason and no total header: the backend filter did not
+    # run (fail-open / disabled / older backend) and returned the full list.
+    with mock_codealive_server(
+        {
+            ("GET", "/api/datasources/ready?query=add+OAuth"): (
+                200,
+                [
+                    {"id": "repo-1", "name": "backend", "type": "Repository"},
+                    {"id": "repo-2", "name": "frontend", "type": "Repository"},
+                ],
+            )
+        }
+    ) as (base_url, _requests):
+        client = CodeAliveClient(api_key="skill-test-key", base_url=base_url)
+        result = client.get_datasources(query="add OAuth")
+
+    assert len(result["dataSources"]) == 2
+    assert "FULL unfiltered list" in result["message"]
+
+
+def test_get_datasources_blank_query_behaves_like_no_query():
+    with mock_codealive_server(
+        {
+            ("GET", "/api/datasources/ready"): (
+                200,
+                [{"id": "repo-1", "name": "backend", "type": "Repository"}],
+            )
+        }
+    ) as (base_url, requests):
+        client = CodeAliveClient(api_key="skill-test-key", base_url=base_url)
+        result = client.get_datasources(query="   ")
+
+    assert result == [{"id": "repo-1", "name": "backend", "type": "Repository"}]
+    assert requests[0]["path"] == "/api/datasources/ready"
+
+
 def test_api_client_search_fetch_and_chat_use_expected_endpoints():
     def search_handler(request):
         assert "Query=auth" in request["path"]

From 1d549833a1b171c966a307279e68fdad1ab9dcd6 Mon Sep 17 00:00:00 2001
From: sciapanCA <sciapan@codealive.dev>
Date: Tue, 9 Jun 2026 21:30:48 +0200
Subject: [PATCH 2/2] Handle confident-empty relevance verdicts and header
 casing drift

---
 .../scripts/lib/api_client.py                 | 47 +++++++++++++------
 tests/test_setup_and_client.py                | 35 +++++++++++++-
 2 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/skills/codealive-context-engine/scripts/lib/api_client.py b/skills/codealive-context-engine/scripts/lib/api_client.py
index b7c4df0..705481b 100644
--- a/skills/codealive-context-engine/scripts/lib/api_client.py
+++ b/skills/codealive-context-engine/scripts/lib/api_client.py
@@ -20,32 +20,49 @@
 _OBJECT_ID_RE = re.compile(r"^[0-9a-fA-F]{24}$")
 
 # Pre-filter scoped candidate count, emitted by the backend only on relevance-filtered
-# (query'd) data source listings.
-_TOTAL_DATA_SOURCES_HEADER = "X-CodeAlive-Total-Data-Sources"
+# (query'd) data source listings. Lowercase because _make_request lowercases header
+# keys (proxies/origins may change response-header casing; HTTP headers are
+# case-insensitive per RFC 9110).
+_TOTAL_DATA_SOURCES_HEADER = "x-codealive-total-data-sources"
 
 
 def relevance_message(datasources: List[Dict[str, Any]], total_header: Optional[str]) -> str:
     """Build the hint accompanying a query'd (relevance-filtered) data source listing.
 
     The backend guarantees every relevance-selected item carries a non-empty
-    ``relevanceReason``, so a query'd response where NO item has one means the filter
-    did not run (fail-open on error, disabled by config, or an older backend ignoring
-    ``query``) and the FULL list was returned — the caller must be told, instead of
-    mistaking the full dump for a relevant shortlist.
-    """
-    filtered = any(ds.get("relevanceReason") for ds in datasources)
-    if not filtered:
-        return (
-            "Relevance filtering was unavailable for this request (it may have failed or be "
-            "disabled), so the FULL unfiltered list of data sources is returned."
-        )
+    ``relevanceReason``, so a NON-EMPTY query'd response where no item has one means
+    the filter did not run (fail-open on error, disabled by config, or an older
+    backend ignoring ``query``) and the FULL list was returned — the caller must be
+    told, instead of mistaking the full dump for a relevant shortlist.
 
+    An EMPTY response is never fail-open output when the total header reports
+    available candidates (fail-open returns the full, hence non-empty, list): it is
+    the filter's confident-empty verdict — it ran and matched nothing.
+
+    The total header is NOT a filter-success signal: the backend emits it on every
+    query'd response, including fail-open.
+    """
     shown = len(datasources)
     try:
         total = int(total_header)
     except (TypeError, ValueError):
         # Header absent (TypeError on int(None)) or malformed (ValueError).
         total = None
+
+    if shown == 0:
+        if total is not None and total > 0:
+            return (
+                f"None of the {total} available data sources are relevant to this query. "
+                "List without a query to get the full list."
+            )
+        return "No data sources are available."
+
+    filtered = any(ds.get("relevanceReason") for ds in datasources)
+    if not filtered:
+        return (
+            "Relevance filtering was unavailable for this request (it may have failed or be "
+            "disabled), so the FULL unfiltered list of data sources is returned."
+        )
     if total is not None and total > shown:
         return (
             f"{shown} of {total} available data sources are relevant to this query; the other "
@@ -355,7 +372,9 @@ def _make_request(
                 response_data = response.read().decode("utf-8")
                 parsed = json.loads(response_data) if response_data else {}
                 if return_headers:
-                    return parsed, dict(response.headers.items())
+                    # Lowercase keys: header casing is not guaranteed end-to-end
+                    # (RFC 9110 §5.1), and a plain dict lookup is case-sensitive.
+                    return parsed, {k.lower(): v for k, v in response.headers.items()}
                 return parsed
         except urllib.error.HTTPError as e:
             error_body = e.read()
diff --git a/tests/test_setup_and_client.py b/tests/test_setup_and_client.py
index 1e0cf32..a11c1b3 100644
--- a/tests/test_setup_and_client.py
+++ b/tests/test_setup_and_client.py
@@ -81,7 +81,7 @@ def datasources_handler(_request):
                 "type": "Repository",
                 "relevanceReason": "Implements the checkout flow",
             }
-        ], {"X-CodeAlive-Total-Data-Sources": "3"}
+        ], {"x-codealive-total-data-sources": "3"}  # lowercase: proxies may normalize casing
 
     with mock_codealive_server(
         {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler}
@@ -116,6 +116,39 @@ def test_get_datasources_query_fail_open_warns_full_list_returned():
     assert "FULL unfiltered list" in result["message"]
 
 
+def test_get_datasources_query_confident_empty_reports_nothing_relevant():
+    # Empty list + total header: the filter ran and confidently matched nothing.
+    # Must NOT be mistaken for fail-open (fail-open returns the full, non-empty list).
+    def datasources_handler(_request):
+        return 200, [], {"X-CodeAlive-Total-Data-Sources": "3"}
+
+    with mock_codealive_server(
+        {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler}
+    ) as (base_url, _requests):
+        client = CodeAliveClient(api_key="skill-test-key", base_url=base_url)
+        result = client.get_datasources(query="add OAuth")
+
+    assert result["dataSources"] == []
+    assert "None of the 3 available data sources are relevant" in result["message"]
+    assert "List without a query" in result["message"]
+
+
+def test_get_datasources_query_empty_org_reports_no_sources():
+    # Empty list and total header reports zero candidates: the org simply has no
+    # data sources — not a relevance verdict, not a filter failure.
+    def datasources_handler(_request):
+        return 200, [], {"X-CodeAlive-Total-Data-Sources": "0"}
+
+    with mock_codealive_server(
+        {("GET", "/api/datasources/ready?query=add+OAuth"): datasources_handler}
+    ) as (base_url, _requests):
+        client = CodeAliveClient(api_key="skill-test-key", base_url=base_url)
+        result = client.get_datasources(query="add OAuth")
+
+    assert result["dataSources"] == []
+    assert result["message"] == "No data sources are available."
+
+
 def test_get_datasources_blank_query_behaves_like_no_query():
     with mock_codealive_server(
         {