def _substitute_url_template(template: str, child_data: dict, captured_input: str) -> str:
    """Substitute ``{var}`` placeholders in a URL template.

    Two substitution modes are layered:

    1. Explicit variables -- ``child_data["variables"]`` maps a variable
       name to a dict of the form ``{"extract": <regex string>}``. The regex
       is applied with ``re.match`` to ``captured_input``; capture group 1
       (or the whole match when the regex defines no groups) becomes the
       variable's value. Example: ``^CWE-(\\d+)$`` against "CWE-79" gives
       ``num`` the value "79".

    2. Implicit ``{id}`` -- defaults to ``captured_input`` itself unless an
       explicit ``id`` variable already produced a value. Example:
       "https://www.cve.org/CVERecord?id={id}" with "CVE-2021-44228".

    Placeholders that end up without a value (unknown name, regex that does
    not match, invalid regex, optional group that did not participate) are
    left in the output verbatim so registry mistakes stay visible instead of
    being silently swallowed.

    Args:
        template: URL template, possibly containing ``{name}`` placeholders.
        child_data: Registry data for the matched child; only its optional
            ``variables`` mapping is read here.
        captured_input: The text the placeholders are derived from. ``None``
            is tolerated and leaves the template untouched.

    Returns:
        The template with every resolvable placeholder replaced.
    """
    if "{" not in template:
        return template
    if captured_input is None:
        # Nothing to extract from and nothing to default {id} to: keep the
        # placeholders visible rather than raising TypeError further down.
        return template

    values: dict[str, str] = {}

    # Mode 1: explicit variables. Malformed registry entries are skipped
    # instead of crashing the whole resolution.
    declared = child_data.get("variables") if isinstance(child_data, dict) else None
    if not isinstance(declared, dict):
        declared = {}
    for var_name, var_def in declared.items():
        extract_regex = var_def.get("extract") if isinstance(var_def, dict) else None
        if not extract_regex or not isinstance(extract_regex, str):
            continue
        try:
            m = re.match(extract_regex, captured_input)
        except re.error:
            continue
        if m is None:
            continue
        value = m.group(1) if m.groups() else m.group(0)
        # group(1) is None when the group is optional and did not take part
        # in the match; treat that the same as "no value".
        if value is not None:
            values[var_name] = value

    # Mode 2: implicit {id} default (an explicit `id` value wins).
    values.setdefault("id", captured_input)

    def _fill(placeholder):
        # Unknown names fall back to the original "{name}" text.
        return values.get(placeholder.group(1), placeholder.group(0))

    # Single pass over the template: substituted values are never rescanned,
    # so a value that itself contains "{id}" cannot be expanded a second time.
    return re.sub(r"\{([^{}]*)\}", _fill, template)
Returns the API response envelope.""" secid_query = secid_query.strip() @@ -231,13 +277,21 @@ def _build_node_result(node: dict, subpath: Optional[str], version: Optional[str result = {"secid": secid} if child.get("weight"): result["weight"] = child["weight"] - if child_data.get("url"): - result["url"] = child_data["url"] - _add_format_metadata(result, child_data) - result["data"] = { - "description": child.get("description", ""), - **{k: v for k, v in child_data.items() if k != "url"}, - } + url_template = child_data.get("url") + if url_template: + # URL-bearing result: substitute {var} placeholders; + # do NOT include `data` block (canonical contract). + result["url"] = _substitute_url_template(url_template, child_data, subpath) + _add_format_metadata(result, child_data) + else: + # Description-only result: include `data` block + # with descriptive context (variables is internal, + # not exposed in the public response). + _add_format_metadata(result, child_data) + result["data"] = { + "description": child.get("description", ""), + **{k: v for k, v in child_data.items() if k not in ("url", "variables")}, + } return result except re.error: continue @@ -253,13 +307,16 @@ def _build_node_result(node: dict, subpath: Optional[str], version: Optional[str result = {"secid": secid} if child.get("weight"): result["weight"] = child["weight"] - if child_data.get("url"): - result["url"] = child_data["url"] - _add_format_metadata(result, child_data) - result["data"] = { - "description": child.get("description", ""), - **{k: v for k, v in child_data.items() if k != "url"}, - } + url_template = child_data.get("url") + if url_template: + result["url"] = _substitute_url_template(url_template, child_data, version) + _add_format_metadata(result, child_data) + else: + _add_format_metadata(result, child_data) + result["data"] = { + "description": child.get("description", ""), + **{k: v for k, v in child_data.items() if k not in ("url", "variables")}, + } return result except re.error: 
# ---------------------------------------------------------------------------
# URL template substitution (added in Phase 2.5a)
# ---------------------------------------------------------------------------


def test_substitute_template_no_placeholders():
    """A template containing no braces comes back unchanged."""
    from resolver import _substitute_url_template

    rendered = _substitute_url_template("https://example.com/static", {}, "anything")
    assert rendered == "https://example.com/static"


def test_substitute_implicit_id():
    """With no explicit variables, {id} takes the entire captured input.

    Mirrors the CVE entry, whose URL template ends in '?id={id}'.
    """
    from resolver import _substitute_url_template

    template = "https://www.cve.org/CVERecord?id={id}"
    rendered = _substitute_url_template(template, {}, "CVE-2021-44228")
    assert rendered == "https://www.cve.org/CVERecord?id=CVE-2021-44228"


def test_substitute_explicit_variable():
    """{num} is filled from the first capture group of variables.num.extract.

    Mirrors the CWE entry: 'CWE-79' run through '^CWE-(\\d+)$' gives '79'.
    """
    from resolver import _substitute_url_template

    num_spec = {"extract": r"^CWE-(\d+)$", "description": "Numeric CWE ID"}
    registry_entry = {"variables": {"num": num_spec}}
    template = "https://cwe.mitre.org/data/definitions/{num}.html"

    rendered = _substitute_url_template(template, registry_entry, "CWE-79")
    assert rendered == "https://cwe.mitre.org/data/definitions/79.html"


def test_substitute_multiple_variables():
    """Two extract regexes feed {parent} and {sub} (ATT&CK sub-techniques)."""
    from resolver import _substitute_url_template

    registry_entry = {
        "variables": {
            "parent": {"extract": r"^(T\d{4})\.\d{3}$"},
            "sub": {"extract": r"^T\d{4}\.(\d{3})$"},
        }
    }
    template = "https://attack.mitre.org/techniques/{parent}/{sub}/"

    rendered = _substitute_url_template(template, registry_entry, "T1059.003")
    assert rendered == "https://attack.mitre.org/techniques/T1059/003/"


def test_substitute_implicit_id_with_explicit_others():
    """The implicit {id} default still applies alongside explicit variables.

    The two modes are layered rather than mutually exclusive.
    """
    from resolver import _substitute_url_template

    registry_entry = {"variables": {"num": {"extract": r"^CAPEC-(\d+)$"}}}
    # A template mixing {id} and {num} must have both placeholders filled.
    template = "https://example.com/{id}-num{num}"

    rendered = _substitute_url_template(template, registry_entry, "CAPEC-66")
    assert rendered == "https://example.com/CAPEC-66-num66"


def test_substitute_unrecognized_placeholder_left_visible():
    """Placeholders with no known value survive verbatim in the output.

    Keeping them visible makes registry mistakes easier to diagnose than
    silently dropping them would.
    """
    from resolver import _substitute_url_template

    template = "https://example.com/{unknown}/{id}"
    rendered = _substitute_url_template(template, {}, "test123")
    # {unknown} is untouched; {id} falls back to the captured input.
    assert rendered == "https://example.com/{unknown}/test123"


# ---------------------------------------------------------------------------
# resolve() basic invariants
# ---------------------------------------------------------------------------