diff --git a/tests/conformance/test_observability.py b/tests/conformance/test_observability.py index 24aba6d..5b6fe9a 100644 --- a/tests/conformance/test_observability.py +++ b/tests/conformance/test_observability.py @@ -189,18 +189,144 @@ def _reset_otel_global_tracer_provider(restore_to: object) -> None: ) +_EMBEDDING_DEFER = ( + "embedding capability (proposal 0059) unimplemented until v0.16.0; " + "no embedding event/provider to record from" +) + + +# Pinned observability fixtures NOT run by this YAML harness, each with an +# explicit reason. The coverage guard +# (test_observability_fixture_coverage_is_complete) fails on any pinned +# fixture absent from _SUPPORTED_FIXTURES and the three dicts below, so a +# future unwired spec fixture cannot silently pytest.skip past CI. +# +# _DEFERRED_FIXTURES — not run because the capability or its harness wiring is unimplemented. _DEFERRED_FIXTURES: dict[str, str] = { - # Proposal 0045 (nested-lineage augmentation, v0.37.0) — engine - # + observer work lands in PR 11. - "039-nested-lineage-augmentation": ("Proposal 0045 not yet implemented (PR 11)"), - # Proposal 0067 (GenAI metrics, v0.68.0) — the embedding metrics - # fixture sources from an embedding call, but the embedding capability - # (proposal 0059, observability §5.5.8 / §5.5.9) is unimplemented in - # python until v0.16.0, so there is no embedding event or provider to - # record from. The LLM-path metric fixtures (088 / 090 / 091) run. - "089-embedding-metrics-token-and-duration": ( - "Embedding capability (proposal 0059) unimplemented until v0.16.0" + # Proposal 0045 IS implemented (v0.11.0), but the nested-case Langfuse + # fixture stays deferred: it needs runtime-state item-list lookup for + # nested fan-outs plus an augment_metadata_from_outer_item directive + # the harness doesn't model yet. + "039-nested-lineage-augmentation": ( + "nested-case Langfuse harness wiring not yet implemented (proposal 0045 nested fan-out)" ), + # Embedding observability (proposals 0059 / 0067 §11).
The embedding + # capability is unshipped until v0.16.0; the LLM-path equivalents run. + **{ + fixture_id: _EMBEDDING_DEFER + for fixture_id in ( + "074-embedding-event-dispatch", + "075-embedding-failure-event-dispatch-on-provider-unavailable", + "076-embedding-event-mutual-exclusion", + "077-embedding-event-call-id-distinct", + "078-embedding-event-input-strings-populated", + "079-embedding-event-request-params-populated", + "080-embedding-event-input-count-and-dimensions-populated", + "081-embedding-event-active-prompt-populated", + "082-otel-embedding-span-attributes", + "083-langfuse-embedding-observation", + "089-embedding-metrics-token-and-duration", + ) + }, +} + + +# _UNIT_TESTED_FIXTURES — implemented behavior covered by the dedicated unit +# suite rather than wired into this YAML harness. Each value names the proposal +# and the covering file. +_UNIT_TESTED_FIXTURES: dict[str, str] = { + fixture_id: reason + for fixture_ids, reason in ( + ( + ("022-langfuse-basic-trace", "023-langfuse-generation-rendering", "024-langfuse-prompt-linkage"), + "proposal 0031 Langfuse mapping; covered by test_observability_langfuse.py", + ), + ( + ( + "031-langfuse-subgraph-span-hierarchy", + "032-langfuse-fan-out-per-instance-spans", + "033-langfuse-detached-trace-mode", + ), + "proposal 0035/0061 Langfuse span hierarchy; covered by test_observability_langfuse.py", + ), + ( + ( + "027-langfuse-caller-supplied-metadata", + "029-caller-metadata-fan-out-per-instance", + "034-caller-metadata-open-span-update-serial", + ), + "proposal 0034/0040 caller metadata; covered by test_observability_langfuse.py", + ), + ( + ("030-caller-metadata-parallel-branches-per-branch",), + "proposal 0040 per-branch caller metadata; covered by test_observability_otel.py", + ), + ( + ("035-caller-invocation-id-uuid", "036-caller-invocation-id-non-uuid"), + "proposal 0039 invocation_id derivation; covered by test_observability_langfuse_adapter.py", + ), + ( + ("037-langfuse-trace-input-output",), +
"proposal 0043 trace input/output; covered by test_observability_langfuse.py", + ), + ( + ( + "043-get-invocation-metadata-roundtrip", + "044-get-invocation-metadata-fan-out-scoping", + "045-get-invocation-metadata-retry-scoping", + "046-get-invocation-metadata-outside-invocation", + ), + "proposal 0048 get_invocation_metadata; covered by test_observability_metadata.py", + ), + ( + ("059-implementation-attribution-langfuse",), + "proposal 0052 implementation attribution; covered by test_observability_langfuse.py", + ), + ( + ( + "060-llm-completion-event-input-messages-populated", + "061-llm-completion-event-output-content-populated", + "062-llm-completion-event-request-params-populated", + "063-llm-completion-event-request-extras-populated", + "064-llm-completion-event-active-prompt-populated", + "066-llm-completion-event-active-prompt-group-populated", + "067-llm-completion-event-call-id-always-present-and-distinct", + "068-llm-completion-event-response-model-distinct-from-request", + ), + "proposal 0057 LlmCompletionEvent fields; covered by test_llm_provider.py", + ), + ( + ("065-llm-completion-event-active-prompt-null",), + "proposal 0057 active_prompt null case; covered by test_observability_otel.py", + ), + ( + ( + "069-llm-failure-event-dispatch-on-provider-unavailable", + "070-llm-failure-event-dispatch-on-provider-invalid-request", + "071-llm-failure-event-call-id-distinct-from-completion-event", + "072-llm-failure-event-mutual-exclusion-with-completion-event", + "073-llm-failure-event-error-type-vendor-specific", + ), + "proposal 0058 LlmFailedEvent; covered by test_llm_provider.py", + ), + ) + for fixture_id in fixture_ids +} + + +# _CONVENTION_ONLY_FIXTURES — proposal 0048 §9 queryable-observer pattern is +# convention-only (no new abstract surface on Observer), satisfied via +# docs/concepts/observability.md, so there is no library API to assert. 
+_CONVENTION_ONLY_FIXTURES: dict[str, str] = { + fixture_id: ( + "proposal 0048 §9 queryable-observer pattern is convention-only " + "(no library surface); satisfied by docs/concepts/observability.md" + ) + for fixture_id in ( + "047-queryable-observer-pattern", + "048-queryable-observer-async-safety", + "049-queryable-observer-lifecycle-drop", + ) } @@ -224,6 +350,37 @@ def _load(path: Path) -> dict[str, Any]: return cast("dict[str, Any]", yaml.safe_load(f)) +def test_observability_fixture_coverage_is_complete() -> None: + # Fail-on-unknown guard. Every pinned observability conformance fixture + # MUST be either run (_SUPPORTED_FIXTURES) or explicitly accounted for: + # _DEFERRED_FIXTURES (future capability), _UNIT_TESTED_FIXTURES (covered + # by the unit suite, not this YAML harness), or _CONVENTION_ONLY_FIXTURES + # (doc-satisfied, no library surface). A new spec fixture that is none of + # these fails HERE rather than silently pytest.skip-ping past CI. + all_ids = {p.stem for p in _fixture_paths()} + accounted = ( + set(_SUPPORTED_FIXTURES) + | _DEFERRED_FIXTURES.keys() + | _UNIT_TESTED_FIXTURES.keys() + | _CONVENTION_ONLY_FIXTURES.keys() + ) + unaccounted = sorted(all_ids - accounted) + assert not unaccounted, ( + "unaccounted observability conformance fixtures: wire each into " + "_SUPPORTED_FIXTURES once it runs, or document it in _DEFERRED_FIXTURES " + "(future capability) / _UNIT_TESTED_FIXTURES (covered by the unit suite) " + f"/ _CONVENTION_ONLY_FIXTURES (doc-satisfied): {unaccounted}" + ) + # An accounting entry whose fixture no longer exists on disk (renamed at + # a pin bump) should be removed. + stale = sorted(accounted - all_ids) + assert not stale, f"accounting entries with no fixture file (remove): {stale}" + # A fixture cannot be both run and documented-as-not-run. 
+ not_run = _DEFERRED_FIXTURES.keys() | _UNIT_TESTED_FIXTURES.keys() | _CONVENTION_ONLY_FIXTURES.keys() + overlap = sorted(set(_SUPPORTED_FIXTURES) & not_run) + assert not overlap, f"fixtures both run and documented-as-not-run (pick one): {overlap}" + + # --------------------------------------------------------------------------- # Per-fixture dispatcher # --------------------------------------------------------------------------- @@ -232,10 +389,18 @@ def _load(path: Path) -> dict[str, Any]: @pytest.mark.parametrize("fixture_path", _fixture_paths(), ids=_fixture_id) async def test_observability_fixture(fixture_path: Path) -> None: fixture_id = fixture_path.stem - if fixture_id in _DEFERRED_FIXTURES: - pytest.skip(f"{fixture_id}: {_DEFERRED_FIXTURES[fixture_id]}") + skip_reason = ( + _DEFERRED_FIXTURES.get(fixture_id) + or _UNIT_TESTED_FIXTURES.get(fixture_id) + or _CONVENTION_ONLY_FIXTURES.get(fixture_id) + ) + if skip_reason is not None: + pytest.skip(f"{fixture_id}: {skip_reason}") if fixture_id not in _SUPPORTED_FIXTURES: - pytest.skip(f"{fixture_id}: harness wiring not yet implemented") + # Unaccounted: neither wired nor documented. The coverage guard + # (test_observability_fixture_coverage_is_complete) fails loudly + # listing every such fixture; the individual case skips here. 
+ pytest.skip(f"{fixture_id}: unaccounted -- see the coverage guard") spec = _load(fixture_path) if fixture_id == "001-otel-basic-trace": diff --git a/tests/unit/test_llm_provider.py b/tests/unit/test_llm_provider.py index 5a696fc..9dd4225 100644 --- a/tests/unit/test_llm_provider.py +++ b/tests/unit/test_llm_provider.py @@ -1782,6 +1782,86 @@ def _handler(_req: httpx.Request) -> httpx.Response: assert typed.finish_reason == "tool_calls" +async def test_llm_completion_event_active_prompt_populated_from_context() -> None: + # Proposal 0057 active_prompt: complete() invoked inside a + # with_active_prompt block stamps the active PromptResult onto the + # typed event (the provider reads current_prompt_result()). Covers + # conformance fixture 064 -- the populated record on the EVENT, not + # just the observer's span rendering of an injected field. + from datetime import UTC, datetime + + from openarmature.prompts import PromptResult, with_active_prompt + + now = datetime.now(UTC) + pr = PromptResult( + name="greeting", + version="1", + label="production", + template_hash="sha256:tmpl", + rendered_hash="sha256:rendered", + messages=[UserMessage(content="hi")], + variables={"user": "Alice"}, + fetched_at=now, + rendered_at=now, + ) + events, token = _collecting_dispatch() + transport = _make_openai_response_with_usage( + {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2} + ) + provider = OpenAIProvider(base_url="http://test", model="m", api_key="k", transport=transport) + try: + with with_active_prompt(pr): + await provider.complete([UserMessage(content="hi")]) + finally: + await provider.aclose() + _release_dispatch(token) + + typed = next(e for e in events if isinstance(e, LlmCompletionEvent)) + assert typed.active_prompt == pr + + +async def test_llm_completion_event_active_prompt_group_populated_from_context() -> None: + # Proposal 0057 active_prompt_group: complete() inside a + # with_active_prompt_group block stamps the active PromptGroup onto + # the 
typed event (the provider reads current_prompt_group()). Covers + # conformance fixture 066. + from datetime import UTC, datetime + + from openarmature.prompts import PromptGroup, PromptResult, with_active_prompt_group + + now = datetime.now(UTC) + + def _pr(name: str) -> PromptResult: + return PromptResult( + name=name, + version="1", + label="production", + template_hash="sha256:tmpl", + rendered_hash="sha256:rendered", + messages=[UserMessage(content="hi")], + variables={"user": "Alice"}, + fetched_at=now, + rendered_at=now, + ) + + # PromptGroup requires N>=2 members. + group = PromptGroup(group_name="greetings", members=[_pr("greeting"), _pr("farewell")]) + events, token = _collecting_dispatch() + transport = _make_openai_response_with_usage( + {"prompt_tokens": 1, "completion_tokens": 1, "total_tokens": 2} + ) + provider = OpenAIProvider(base_url="http://test", model="m", api_key="k", transport=transport) + try: + with with_active_prompt_group(group): + await provider.complete([UserMessage(content="hi")]) + finally: + await provider.aclose() + _release_dispatch(token) + + typed = next(e for e in events if isinstance(e, LlmCompletionEvent)) + assert typed.active_prompt_group == group + + async def test_llm_completion_event_request_params_only_carries_supplied_keys() -> None: # Proposal 0057 request_params shape: absence-is-meaningful. Only # caller-supplied gen_ai.request.* keys appear; unset RuntimeConfig