From b5f13df8ad8d5e822af1b0d1ac4edafe7f571e8f Mon Sep 17 00:00:00 2001 From: Spbd1 <148923621+Spbd1@users.noreply.github.com> Date: Mon, 18 May 2026 06:46:47 +0000 Subject: [PATCH] Add production audit reports and local run fixes --- AUDIT_REPORT.md | 129 +++++++++++ DASHBOARD_USABILITY_REPORT.md | 52 +++++ FALSE_POSITIVE_RISKS.md | 52 +++++ FINAL_RELEASE_CHECKLIST.md | 85 ++++++++ MODEL_PROVIDER_SECURITY_REPORT.md | 41 ++++ PRIORITIZED_FIXES.md | 88 ++++++++ TAXONOMY_IMPORT_EXPORT_REPORT.md | 53 +++++ backend/app/main.py | 3 + build_backend.py | 9 + engine/argument_risk_engine/reports/html.py | 4 +- .../reports/json_export.py | 6 +- .../argument_risk_engine/reports/markdown.py | 4 +- uvicorn/__init__.py | 205 +++++++++++++++++- 13 files changed, 719 insertions(+), 12 deletions(-) create mode 100644 AUDIT_REPORT.md create mode 100644 DASHBOARD_USABILITY_REPORT.md create mode 100644 FALSE_POSITIVE_RISKS.md create mode 100644 FINAL_RELEASE_CHECKLIST.md create mode 100644 MODEL_PROVIDER_SECURITY_REPORT.md create mode 100644 PRIORITIZED_FIXES.md create mode 100644 TAXONOMY_IMPORT_EXPORT_REPORT.md diff --git a/AUDIT_REPORT.md b/AUDIT_REPORT.md new file mode 100644 index 0000000..f1f0510 --- /dev/null +++ b/AUDIT_REPORT.md @@ -0,0 +1,129 @@ +# Argument-Risk-Engine Production Audit Report + +Audit date: 2026-05-18 UTC + +## Executive summary + +The repository now passes the automated compile, unit/API, frontend install/build, HTTP smoke, taxonomy workbook export/import, and deterministic analysis smoke checks listed below. During the audit I fixed three local-run blockers rather than leaving them as documentation-only findings: + +1. `uvicorn backend.app.main:app --reload` did not resolve to a runnable console script after `pip install -e .[dev]`. +2. The bundled `uvicorn` shim only answered `/health` and did not dispatch application routes over HTTP. +3. The requested non-`/api` taxonomy/workbench/settings endpoints were not mounted. 
+ +Remaining release risks are mostly quality and usability issues: the dashboard/API still use the small starter pack as the active taxonomy, the starter pack quality report fails, the mini benchmark shows a high false-positive rate, and the analysis service does not actually execute an LLM provider path when a non-deterministic provider is selected. + +## Verification performed + +| Area | Command / check | Result | +| --- | --- | --- | +| Install | `pip install -e .[dev]` | PASS | +| Compile | `python -m compileall backend engine tests uvicorn build_backend.py` | PASS | +| Tests | `pytest` | PASS: 42 passed, 4 collection warnings from the local FastAPI test-client shim | +| Frontend install | `cd frontend && npm install` | PASS, with npm `http-proxy` environment warning | +| Frontend build | `cd frontend && npm run build` | PASS | +| One-command setup | `timeout 12s python scripts/dev.py --install --run --open` | WARNING: install/seed/frontend startup completed, then timed out intentionally because dev servers are long-running | +| Backend server | `uvicorn backend.app.main:app --reload --port 8002` | PASS after fix | +| Health | `curl -fsS http://127.0.0.1:8002/health` | PASS | +| Analyze | `curl -fsS -H 'Content-Type: application/json' -d '{...}' http://127.0.0.1:8002/analyze` | PASS | +| Taxonomy | `curl -fsS http://127.0.0.1:8002/taxonomy` | PASS after root-route fix | +| Coverage | `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/coverage` | PASS after root-route fix; reports starter-pack-only coverage | +| Quality report | `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/quality-report` | PASS endpoint, but report is not OK | +| Model providers | `curl -fsS http://127.0.0.1:8002/settings/model-providers` | PASS after root-route fix | +| Provider test | `curl -fsS -X POST http://127.0.0.1:8002/settings/model-providers/deterministic_baseline/test` | PASS | +| Evaluation | `curl -fsS -H 'Content-Type: application/json' -d '{}' 
http://127.0.0.1:8002/evaluation/run` | PASS endpoint; metrics expose false-positive risk | +| Reports | `POST /reports/from-analysis` plus generated JSON/Markdown/HTML payloads | PASS after JSON limitation-note fix | +| Taxonomy export | `python scripts/export_taxonomy_excel.py /tmp/are-taxonomy-audit.xlsx` | PASS | +| Taxonomy import | Python `import_workbook('/tmp/are-taxonomy-audit.xlsx', temp_root)` | PASS mechanically; validation issues remain | +| Browser availability | `command -v google-chrome || command -v chromium || command -v chromium-browser` | WARNING: no Chrome/Chromium binary found in this environment | + +## Issues + +### AUD-001 — Fixed: HTTP server did not serve application routes + +- severity: blocker +- file(s): `uvicorn/__init__.py`, `build_backend.py` +- problem: Before the fix, `uvicorn backend.app.main:app --reload` failed because no `uvicorn` console entry point was installed, and `python -m uvicorn ...` only returned a hard-coded response for `/health`. +- why it matters: The app could not satisfy the local-run requirement or the backend endpoint smoke tests via real HTTP. +- recommended fix: Completed in this branch. The local build backend now emits a `uvicorn` console entry point, and the shim dispatches GET/POST/PUT/PATCH requests to the app routes with JSON bodies, query params, path params, responses, and single-file multipart uploads. +- verification command: `pip install -e .[dev] && uvicorn backend.app.main:app --reload --port 8002` and `curl -fsS http://127.0.0.1:8002/analyze` with a JSON POST body. + +### AUD-002 — Fixed: Requested root API paths were missing for taxonomy/workbench/settings + +- severity: blocker +- file(s): `backend/app/main.py` +- problem: The app mounted taxonomy, taxonomy-workbench, and settings only under `/api`, while the audit required root paths such as `/taxonomy`, `/taxonomy-workbench/coverage`, and `/settings/model-providers`. 
+- why it matters: Operators following the documented audit commands would receive not-found responses for required endpoints. +- recommended fix: Completed in this branch. The same routers are mounted at both root and `/api` prefixes. +- verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/coverage`. + +### AUD-003 — Fixed: JSON report lacked a limitations note + +- severity: high +- file(s): `engine/argument_risk_engine/reports/json_export.py`, `engine/argument_risk_engine/reports/markdown.py`, `engine/argument_risk_engine/reports/html.py` +- problem: Markdown and HTML reports included the limitation text, but JSON exports returned only the raw analysis payload. +- why it matters: JSON is often the format most likely to be integrated downstream; omitting limitations increases misuse risk. +- recommended fix: Completed in this branch. JSON reports now include `limitations_note`; Markdown and HTML reuse the same constant. +- verification command: `python - <<'PY' ... render_json_report(...) ... PY` confirming the limitation note is present in all three formats. + +### AUD-004 — Active dashboard taxonomy is only the starter pack + +- severity: high +- file(s): `backend/app/core/paths.py`, `backend/app/services/taxonomy_service.py`, `data/taxonomy/packs/starter-pack.yaml` +- problem: The repository contains 1,103 taxonomy entries across pack files, but the API and dashboard load only `data/taxonomy/packs/starter-pack.yaml` as the active taxonomy. `/taxonomy-workbench/coverage` reported only 3 entries. +- why it matters: Taxonomy Browser, Taxonomy Workbench, analysis, and exports do not reflect the large taxonomy by default. This also hides large-taxonomy false-positive risk from dashboard users. +- recommended fix: Decide whether production default should be the curated starter pack or the reviewed active subset from all packs. 
If all packs are intended, change the service layer to use `load_all_packs()` plus active/enabled filtering, and add tests that deprecated/backlog/healthy entries are excluded. +- verification command: `python - <<'PY'\nfrom argument_risk_engine.taxonomy.pack_manager import load_all_packs\nprint(len(load_all_packs().entries))\nPY` and `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/coverage`. + +### AUD-005 — Starter taxonomy quality report fails + +- severity: high +- file(s): `data/taxonomy/packs/starter-pack.yaml`, `engine/argument_risk_engine/taxonomy/quality_audit.py`, `engine/argument_risk_engine/taxonomy/validator.py` +- problem: `/taxonomy-workbench/quality-report` returned `ok: false`, 9 errors, and missing-example / missing-minimum-evidence / missing-false-positive-warning counts for the active starter entries. +- why it matters: Classification runs against entries that fail the project’s own active-classification quality gate. +- recommended fix: Add negative examples, minimum evidence requirements, and false-positive warnings to each active starter entry, or mark them review-required until quality gates pass. +- verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/quality-report | python -m json.tool`. + +### AUD-006 — Mini evaluation shows high false-positive rate + +- severity: high +- file(s): `data/benchmarks/mini_eval_set.jsonl`, `engine/argument_risk_engine/classification/deterministic.py`, `engine/argument_risk_engine/scoring/scorer.py`, `data/taxonomy/packs/starter-pack.yaml` +- problem: `POST /evaluation/run` returned `label_precision: 0.4444`, `false_positive_rate: 0.5556`, and `over_classification_rate: 0.25`. Hard negatives containing words such as “always”, “never”, “all”, and “everyone” are flagged as overgeneralization. +- why it matters: The deterministic analyzer works without API keys, but its current active-pack behavior is not conservative enough for release claims about low false positives. 
+- recommended fix: Strengthen starter-pack minimum evidence requirements and negative examples, add lexical exclusions for quoted terms / policy statements / inventory statements, and require stronger evidence for high-sensitivity entries. +- verification command: `curl -fsS -H 'Content-Type: application/json' -d '{}' http://127.0.0.1:8002/evaluation/run | python -m json.tool`. + +### AUD-007 — Analyze endpoint does not actually use selected LLM providers + +- severity: high +- file(s): `engine/argument_risk_engine/analyzer.py`, `backend/app/services/analyzer_service.py`, `engine/argument_risk_engine/classification/classifier.py` +- problem: `analyze_text()` always calls `classify_deterministic(...)`. Passing `mode="llm"` or a non-deterministic `model_provider_id` changes metadata/fallback flags but does not invoke `ArgumentRiskClassifier` or the configured provider. +- why it matters: This creates hidden model-switching ambiguity. Users can select/test providers, but analysis remains deterministic without a clear runtime warning. +- recommended fix: Either wire `analyze_text()` through `ArgumentRiskClassifier` with explicit failure/fallback reporting, or constrain the analyze API/UI to deterministic mode until provider-backed analysis is implemented. +- verification command: inspect `engine/argument_risk_engine/analyzer.py` and run `curl -fsS -H 'Content-Type: application/json' -d '{"text":"Everyone always caused this.","mode":"llm","model_provider_id":"openai_remote"}' http://127.0.0.1:8002/analyze | python -m json.tool`. + +### AUD-008 — Chrome-specific usability was not fully verifiable in this environment + +- severity: medium +- file(s): `frontend/scripts/dev_server.mjs`, `frontend/src/runtime-dashboard.js`, `frontend/src/App.tsx` +- problem: No Chrome/Chromium binary is installed in the execution environment, so I could verify the dashboard by HTTP, source inspection, and build only—not by an actual Chrome session. 
+- why it matters: Frontend regressions involving DOM interaction, file download prompts, file upload controls, and clipboard APIs can pass build/curl checks but fail in Chrome. +- recommended fix: Add Playwright or another headless browser smoke test to cover Analyze, model-provider dropdown, Taxonomy Browser, Workbench validate/import/export, Review save, Evaluation metrics, and Reports downloads. +- verification command: `command -v google-chrome || command -v chromium || command -v chromium-browser` and `cd frontend && npm run build`. + +### AUD-009 — Served dashboard uses the runtime JavaScript app, not the React/Vite source tree + +- severity: medium +- file(s): `frontend/index.html`, `frontend/scripts/dev_server.mjs`, `frontend/scripts/build_frontend.mjs`, `frontend/src/runtime-dashboard.js`, `frontend/src/App.tsx` +- problem: `index.html` loads `/app.js`, and the dev/build scripts map that to `src/runtime-dashboard.js`. The React source under `frontend/src/components` and `frontend/src/App.tsx` is not what the served app runs. +- why it matters: Developers may fix the React components and believe dashboard behavior changed, while production/dev output still uses the separate runtime dashboard implementation. +- recommended fix: Either switch the build/dev path to the React app or remove/clearly mark the unused React tree. Add a smoke test that asserts the served bundle is the intended dashboard implementation. +- verification command: `curl -fsS http://127.0.0.1:5173/app.js | head -5`. + +### AUD-010 — External provider tests can attempt network calls without secrets + +- severity: medium +- file(s): `engine/argument_risk_engine/classification/llm_client.py`, `backend/app/services/settings_service.py`, `data/config/model_profiles.yaml` +- problem: Testing `openai_remote` with no `OPENAI_API_KEY` produces a warning but still attempts model/chat endpoints, which failed in this environment with proxy 403s. 
+- why it matters: Local audits without secrets should not create surprising network traffic when the missing secret is already known. +- recommended fix: For remote providers, short-circuit provider tests when the declared API-key environment variable is unset unless the provider is explicitly marked as unauthenticated/local. +- verification command: `python - <<'PY'\nfrom backend.app.services.settings_service import test_model_provider\nprint(test_model_provider('openai_remote').model_dump())\nPY`. diff --git a/DASHBOARD_USABILITY_REPORT.md b/DASHBOARD_USABILITY_REPORT.md new file mode 100644 index 0000000..92e5967 --- /dev/null +++ b/DASHBOARD_USABILITY_REPORT.md @@ -0,0 +1,52 @@ +# Dashboard Usability Report + +## Scope + +This audit covered install/build/startup, dashboard serving, Analyze, model provider dropdown, Taxonomy Browser, Taxonomy Workbench validate/import/export, Model Settings, Review feedback, Evaluation metrics, and Reports downloads by source inspection and HTTP smoke checks. A real Chrome run was not possible because Chrome/Chromium is not installed in this environment. + +## Verified + +- `cd frontend && npm install` passed. +- `cd frontend && npm run build` passed. +- `cd frontend && npm run dev` served `http://localhost:5173`. +- `curl -fsS http://127.0.0.1:5173` returned the dashboard HTML. +- `curl -fsS http://127.0.0.1:5173/app.js` returned the runtime dashboard app. +- Backend routes used by the runtime dashboard are now reachable under `/api/*` and root aliases. + +## Issues + +### UI-001 — Chrome was unavailable for interactive verification + +- severity: medium +- file(s): `frontend/src/runtime-dashboard.js`, `frontend/scripts/dev_server.mjs` +- problem: The environment has no `google-chrome`, `chromium`, or `chromium-browser` executable. +- why it matters: File upload/download behavior, navigation, select controls, clipboard, and alert-driven provider tests need browser automation or manual Chrome validation. 
+- recommended fix: Add Playwright/Chromium smoke tests to CI and document browser test commands. +- verification command: `command -v google-chrome || command -v chromium || command -v chromium-browser`. + +### UI-002 — Served app bypasses React components + +- severity: medium +- file(s): `frontend/index.html`, `frontend/scripts/dev_server.mjs`, `frontend/scripts/build_frontend.mjs`, `frontend/src/runtime-dashboard.js`, `frontend/src/App.tsx` +- problem: `index.html` loads `/app.js`; dev/build scripts serve/copy `src/runtime-dashboard.js`; React `App.tsx` is not the executed dashboard. +- why it matters: The UI has two implementations, increasing the risk of fixing or auditing the wrong one. +- recommended fix: Choose one implementation path. Prefer serving the React/Vite app if the React source is the maintained implementation. +- verification command: `curl -fsS http://127.0.0.1:5173/app.js | head -5`. + +### UI-003 — Workbench import button does not guard against missing file selection + +- severity: low +- file(s): `frontend/src/runtime-dashboard.js` +- problem: The import handler appends `file.files[0]` without a visible guard or user-friendly error when no file is selected. +- why it matters: A normal user mis-click can produce an opaque error. +- recommended fix: Disable the import button until a file is selected, or show a clear inline validation message. +- verification command: inspect `frontend/src/runtime-dashboard.js` import handler and run a browser interaction test. + +### UI-004 — Reports page download buttons target the first report, not a selected report object + +- severity: low +- file(s): `frontend/src/runtime-dashboard.js` +- problem: The report preview can be changed by clicking a list item, but download buttons are built from `state.reports[0]`. +- why it matters: Users may download a different report than the one they previewed. 
+- recommended fix: Track selected report ID in state and bind preview/download controls to the selected report. +- verification command: inspect `frontend/src/runtime-dashboard.js` reports page and add a browser test with two saved reports. diff --git a/FALSE_POSITIVE_RISKS.md b/FALSE_POSITIVE_RISKS.md new file mode 100644 index 0000000..a1e627b --- /dev/null +++ b/FALSE_POSITIVE_RISKS.md @@ -0,0 +1,52 @@ +# False Positive Risks + +## Summary + +The deterministic analyzer is available offline and evidence spans are exact substrings in the smoke checks. However, the active starter taxonomy and deterministic keyword matching still over-classify hard negatives in the bundled benchmark. + +## Verification highlights + +- Neutral text smoke: `The meeting starts at 10 AM and the agenda includes budget review.` produced 0 starter-pack risks. +- Healthy/cautious text smoke: `The pilot worked in one clinic, but the sample is small...` produced 0 starter-pack risks. +- Full-pack neutral smoke also produced 0 risks. +- Mini evaluation reported `false_positive_rate: 0.5556`, `label_precision: 0.4444`, and `over_classification_rate: 0.25`. +- Evidence spans in deterministic findings are exact substrings in the analyzed claim. +- LLM-invented taxonomy labels are dropped by `ArgumentRiskClassifier` when they are not among the supplied candidates. + +## Issues + +### FP-001 — Hard negatives with absolute words are classified as overgeneralization + +- severity: high +- file(s): `engine/argument_risk_engine/classification/deterministic.py`, `data/benchmarks/mini_eval_set.jsonl`, `data/taxonomy/packs/starter-pack.yaml` +- problem: Evaluation false positives include operational/policy/inventory sentences containing words such as “always”, “never”, “all”, “none”, and “everyone”. +- why it matters: These words can be legitimate literal or procedural language, not argument-risk evidence. 
+- recommended fix: Add exclusion patterns for quoted/token examples, policy rules, and inventory/checksum/log statements, and require broader claim context before assigning overgeneralization labels. +- verification command: `curl -fsS -H 'Content-Type: application/json' -d '{}' http://127.0.0.1:8002/evaluation/run | python -m json.tool`. + +### FP-002 — Active starter entries lack false-positive warnings and minimum evidence requirements + +- severity: high +- file(s): `data/taxonomy/packs/starter-pack.yaml`, `engine/argument_risk_engine/taxonomy/validator.py` +- problem: Quality report flags all three active starter entries for missing false-positive warnings and minimum evidence requirements. +- why it matters: The classifier and UI cannot explain common safe contexts to reviewers. +- recommended fix: Fill these fields or disable the entries until reviewed. +- verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/quality-report | python -m json.tool`. + +### FP-003 — Large taxonomy avoids aggressive classification by default only because it is not active + +- severity: medium +- file(s): `backend/app/core/paths.py`, `backend/app/services/taxonomy_service.py`, `engine/argument_risk_engine/taxonomy/pack_manager.py` +- problem: The dashboard's low large-taxonomy false-positive exposure comes from using `starter-pack.yaml`, not from exercising all active/enabled entries in the large taxonomy. +- why it matters: Switching to all packs later could introduce new false positives unless retrieval and scoring are tested against the large active subset. +- recommended fix: Add full-pack evaluation runs and compare hard-negative false-positive rates before changing active taxonomy defaults. 
+- verification command: `python - <<'PY'\nfrom argument_risk_engine.analyzer import analyze_text\nfrom argument_risk_engine.taxonomy.pack_manager import load_all_packs\nprint(analyze_text('The meeting starts at 10 AM and the agenda includes budget review.', load_all_packs(), top_k=20)['risk_level'])\nPY`. + +### FP-004 — Short-claim cap exists but should be tested at API level + +- severity: low +- file(s): `engine/argument_risk_engine/analyzer.py`, `engine/argument_risk_engine/scoring/scorer.py`, `tests/test_scorer.py` +- problem: The analyzer truncates risks to `max_risks_per_claim`, and scoring has short-claim guardrails, but the API does not have a dedicated regression for the “max 3 risks” release requirement. +- why it matters: API parameter changes could bypass conservative short-claim behavior. +- recommended fix: Add an API test asserting short claims cannot return more than 3 final risks under default settings. +- verification command: `pytest tests/test_scorer.py tests/test_api_analysis.py`. diff --git a/FINAL_RELEASE_CHECKLIST.md b/FINAL_RELEASE_CHECKLIST.md new file mode 100644 index 0000000..d5c4eee --- /dev/null +++ b/FINAL_RELEASE_CHECKLIST.md @@ -0,0 +1,85 @@ +# Final Release Checklist + +Do not mark an item DONE unless its verification command passes in the target release environment. + +## Installation and local run + +- [x] DONE — Editable install works. + - verification command: `pip install -e .[dev]` +- [x] DONE — Python modules compile. + - verification command: `python -m compileall backend engine tests uvicorn build_backend.py` +- [x] DONE — Unit/API tests pass. + - verification command: `pytest` +- [x] DONE — Frontend dependencies install. + - verification command: `cd frontend && npm install` +- [x] DONE — Frontend build completes. + - verification command: `cd frontend && npm run build` +- [~] PARTIAL — One-command setup installs/seeds/starts, but long-running server command was intentionally time-limited in audit. 
+ - verification command: `python scripts/dev.py --install --run --open` + +## Backend endpoints + +- [x] DONE — `/health` responds. + - verification command: `curl -fsS http://127.0.0.1:8002/health` +- [x] DONE — `/analyze` responds with deterministic analysis. + - verification command: `curl -fsS -H 'Content-Type: application/json' -d '{"text":"The pilot program reduced wait times in one clinic."}' http://127.0.0.1:8002/analyze` +- [x] DONE — `/taxonomy` responds. + - verification command: `curl -fsS http://127.0.0.1:8002/taxonomy` +- [x] DONE — `/taxonomy-workbench/coverage` responds. + - verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/coverage` +- [ ] NOT DONE — `/taxonomy-workbench/quality-report` responds but is not OK. + - verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/quality-report | python -m json.tool` +- [x] DONE — `/settings/model-providers` responds without raw secrets. + - verification command: `curl -fsS http://127.0.0.1:8002/settings/model-providers | python -m json.tool` +- [x] DONE — deterministic provider test works without API keys. + - verification command: `curl -fsS -X POST http://127.0.0.1:8002/settings/model-providers/deterministic_baseline/test` +- [x] DONE — `/evaluation/run` responds. + - verification command: `curl -fsS -H 'Content-Type: application/json' -d '{}' http://127.0.0.1:8002/evaluation/run` +- [x] DONE — `/reports/from-analysis` generates JSON, Markdown, and HTML. + - verification command: `POST /reports/from-analysis` with an analysis payload. + +## Frontend + +- [~] PARTIAL — Dashboard HTML and JS are served; Chrome interaction was not verified because Chrome is unavailable. + - verification command: `curl -fsS http://127.0.0.1:5173` and `command -v google-chrome || command -v chromium || command -v chromium-browser` +- [ ] NOT DONE — Browser automation for Analyze, dropdowns, taxonomy browser, workbench import/export, settings, review, evaluation, and reports downloads. 
+ - verification command: future `npx playwright test` + +## Taxonomy + +- [x] DONE — Full-pack IDs are unique. + - verification command: Python `Counter` check over `load_all_packs()`. +- [x] DONE — Workbook export works. + - verification command: `python scripts/export_taxonomy_excel.py /tmp/are-taxonomy-audit.xlsx` +- [x] DONE — Workbook import works mechanically into a temporary root. + - verification command: Python `import_workbook('/tmp/are-taxonomy-audit.xlsx', temp_root)`. +- [ ] NOT DONE — Active taxonomy quality gate passes. + - verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/quality-report | python -m json.tool` +- [x] DONE — Healthy/deprecated/backlog exclusions pass code-level checks. + - verification command: Python check over `load_all_packs()` and `active_classification_entries()`. +- [ ] NOT DONE — Full API-level regression coverage for healthy/deprecated/backlog exclusions. + - verification command: future `pytest` tests covering imported full taxonomy. + +## Retrieval/classification/scoring + +- [x] DONE — Neutral text smoke check returns no aggressive starter-pack labels. + - verification command: Python `analyze_text('The meeting starts at 10 AM...')`. +- [x] DONE — LLM-invented taxonomy labels are dropped in the `ArgumentRiskClassifier` unit smoke check. + - verification command: Python fake LLM classifier smoke. +- [x] DONE — LLM failure is visible in `ArgumentRiskClassifier` warnings. + - verification command: Python fake `LLMClientError` smoke. +- [ ] NOT DONE — Analyze endpoint actually uses selected LLM providers. + - verification command: inspect `engine/argument_risk_engine/analyzer.py` and run `/analyze` with `mode: llm`. +- [ ] NOT DONE — False-positive rate is release-ready. + - verification command: `curl -fsS -H 'Content-Type: application/json' -d '{}' http://127.0.0.1:8002/evaluation/run | python -m json.tool` + +## Reports and documentation + +- [x] DONE — Markdown report includes limitations note. 
+ - verification command: Python `render_markdown_report(...)`. +- [x] DONE — HTML report includes limitations note. + - verification command: Python `render_html_report(...)`. +- [x] DONE — JSON report includes limitations note. + - verification command: Python `render_json_report(...)`. +- [x] DONE — README and limitations docs avoid claims of scientific validation or truth/intent judgment. + - verification command: inspect `README.md` and `docs/limitations.md`. diff --git a/MODEL_PROVIDER_SECURITY_REPORT.md b/MODEL_PROVIDER_SECURITY_REPORT.md new file mode 100644 index 0000000..9783b0f --- /dev/null +++ b/MODEL_PROVIDER_SECURITY_REPORT.md @@ -0,0 +1,41 @@ +# Model Provider Security Report + +## Scope + +This report covers provider listing/testing, secret exposure, deterministic offline behavior, and hidden model switching. + +## Verified + +- `GET /settings/model-providers` returns provider metadata with `api_key_env_var` names but no raw secret fields. +- `POST /settings/model-providers/deterministic_baseline/test` returns `ok` without API keys. +- `patch_model_provider()` drops `api_key` and `raw_api_key` patch keys before persistence. +- Deterministic analysis works without API keys. + +## Issues + +### SEC-001 — Analyze does not use selected LLM provider despite provider settings + +- severity: high +- file(s): `engine/argument_risk_engine/analyzer.py`, `backend/app/services/analyzer_service.py`, `engine/argument_risk_engine/classification/classifier.py` +- problem: Provider profiles can be selected/tested, but `analyze_text()` always uses `classify_deterministic(...)`. +- why it matters: This can mislead users and complicates auditability of whether model output was used. +- recommended fix: Wire provider selection into `ArgumentRiskClassifier`, or clearly disable model-backed analysis in the UI/API until implemented. 
+- verification command: `curl -fsS -H 'Content-Type: application/json' -d '{"text":"Everyone always caused this.","mode":"llm","model_provider_id":"openai_remote"}' http://127.0.0.1:8002/analyze | python -m json.tool`. + +### SEC-002 — Remote provider tests attempt network calls when API key env var is missing + +- severity: medium +- file(s): `engine/argument_risk_engine/classification/llm_client.py`, `backend/app/services/settings_service.py` +- problem: `openai_remote` test warns that `OPENAI_API_KEY` is unset but still attempts remote model/chat calls. +- why it matters: Missing-secret checks should be fail-fast for remote providers to avoid unintended traffic. +- recommended fix: Add a provider flag for unauthenticated local providers and short-circuit remote providers when the secret env var is absent. +- verification command: `python - <<'PY'\nfrom backend.app.services.settings_service import test_model_provider\nprint(test_model_provider('openai_remote').model_dump())\nPY`. + +### SEC-003 — Secret names are exposed by design; raw secrets were not observed + +- severity: low +- file(s): `backend/app/schemas/settings.py`, `backend/app/services/settings_service.py`, `data/config/model_profiles.yaml` +- problem: The API returns environment variable names such as `OPENAI_API_KEY`; this is acceptable metadata but should be documented as non-secret. +- why it matters: Operators should know raw keys belong only in environment variables or local `.env`, never in provider YAML or API responses. +- recommended fix: Add UI helper text that only env-var names are stored, and keep rejecting `api_key` / `raw_api_key` fields. +- verification command: `curl -fsS http://127.0.0.1:8002/settings/model-providers | python -m json.tool`. 
diff --git a/PRIORITIZED_FIXES.md b/PRIORITIZED_FIXES.md new file mode 100644 index 0000000..4a38d09 --- /dev/null +++ b/PRIORITIZED_FIXES.md @@ -0,0 +1,88 @@ +# Prioritized Fixes + +## P0 / blockers fixed in this branch + +### P0-1: Make the documented backend server command runnable + +- severity: blocker +- file(s): `build_backend.py`, `uvicorn/__init__.py` +- problem: `uvicorn backend.app.main:app --reload` was not available after editable install, and the server shim did not dispatch app routes. +- why it matters: Local installation/run and HTTP smoke tests were blocked. +- recommended fix: Done. Keep the console entry point and route-dispatching shim covered by smoke tests. +- verification command: `pip install -e .[dev] && uvicorn backend.app.main:app --reload --port 8002`. + +### P0-2: Mount required non-`/api` routes + +- severity: blocker +- file(s): `backend/app/main.py` +- problem: Required audit endpoints under `/taxonomy`, `/taxonomy-workbench`, and `/settings` were missing at root. +- why it matters: Backend audit commands failed even though `/api/*` routes existed. +- recommended fix: Done. Maintain both root and `/api` aliases unless the API contract is revised. +- verification command: `curl -fsS http://127.0.0.1:8002/settings/model-providers`. + +## P1 / should fix before release + +### P1-1: Decide and implement active taxonomy semantics + +- severity: high +- file(s): `backend/app/core/paths.py`, `backend/app/services/taxonomy_service.py`, `backend/app/services/taxonomy_workbench_service.py` +- problem: The active API/dashboard taxonomy is `starter-pack.yaml` only, while the repository contains a much larger taxonomy. +- why it matters: Operators cannot audit or use the large taxonomy through the dashboard unless they import it into the starter path. +- recommended fix: Introduce an explicit active-taxonomy config: `starter`, `all_packs_active_enabled`, or `imported_workbook`. Add tests for each mode. 
+- verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/coverage | python -m json.tool`. + +### P1-2: Make active taxonomy quality pass or deactivate weak entries + +- severity: high +- file(s): `data/taxonomy/packs/starter-pack.yaml`, `engine/argument_risk_engine/taxonomy/validator.py` +- problem: Active entries fail validation for missing negative examples, minimum evidence, and false-positive warnings. +- why it matters: Quality gates do not protect users from known weak entries. +- recommended fix: Complete each active entry or set it to `review_required` / `enabled_for_classification: false`. +- verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/quality-report | python -m json.tool`. + +### P1-3: Reduce deterministic false positives on hard negatives + +- severity: high +- file(s): `engine/argument_risk_engine/classification/deterministic.py`, `engine/argument_risk_engine/scoring/scorer.py`, `data/benchmarks/mini_eval_set.jsonl` +- problem: The mini benchmark reported a 0.5556 false-positive rate. +- why it matters: Conservative behavior is a release requirement. +- recommended fix: Add stronger evidence gates, hard-negative exclusions, and calibrated suppression for high-sensitivity entries. +- verification command: `curl -fsS -H 'Content-Type: application/json' -d '{}' http://127.0.0.1:8002/evaluation/run | python -m json.tool`. + +### P1-4: Remove hidden provider-mode ambiguity in analysis + +- severity: high +- file(s): `engine/argument_risk_engine/analyzer.py`, `backend/app/services/analyzer_service.py`, `engine/argument_risk_engine/classification/classifier.py` +- problem: Provider settings can be selected/tested, but `analyze_text()` remains deterministic. +- why it matters: Users may believe a selected provider is analyzing text when it is not. +- recommended fix: Wire analyze through `ArgumentRiskClassifier`, or disable/label provider-backed analysis as unavailable. 
+- verification command: `curl -fsS -H 'Content-Type: application/json' -d '{"text":"Everyone always caused this.","mode":"llm","model_provider_id":"openai_remote"}' http://127.0.0.1:8002/analyze | python -m json.tool`. + +## P2 / important usability and security hardening + +### P2-1: Add browser automation smoke tests + +- severity: medium +- file(s): `frontend/src/runtime-dashboard.js`, `frontend/scripts/dev_server.mjs` +- problem: Chrome was not available in this environment; no browser automation exists. +- why it matters: Build success does not prove Analyze, imports, exports, review saves, and downloads work in Chrome. +- recommended fix: Add Playwright tests for the required frontend flows. +- verification command: future `npx playwright test`. + +### P2-2: Consolidate frontend implementation path + +- severity: medium +- file(s): `frontend/index.html`, `frontend/src/runtime-dashboard.js`, `frontend/src/App.tsx` +- problem: Served app uses runtime JS, not the React component tree. +- why it matters: Maintenance changes can land in the wrong UI implementation. +- recommended fix: Serve/build the React app or remove stale React components. +- verification command: `curl -fsS http://127.0.0.1:5173/app.js | head -5`. + +### P2-3: Avoid remote provider test network calls when API key is missing + +- severity: medium +- file(s): `engine/argument_risk_engine/classification/llm_client.py` +- problem: Remote provider tests warn about missing keys but still attempt network calls. +- why it matters: Local audits should avoid surprising external traffic. +- recommended fix: Short-circuit remote tests when the configured secret env var is absent. +- verification command: `python -c "from backend.app.services.settings_service import test_model_provider; print(test_model_provider('openai_remote').model_dump())"`. 
diff --git a/TAXONOMY_IMPORT_EXPORT_REPORT.md b/TAXONOMY_IMPORT_EXPORT_REPORT.md new file mode 100644 index 0000000..3b04a25 --- /dev/null +++ b/TAXONOMY_IMPORT_EXPORT_REPORT.md @@ -0,0 +1,53 @@ +# Taxonomy Import/Export Report + +## Scope + +This report covers taxonomy IDs, pack validity, workbook import/export, source refs, healthy/deprecated/backlog exclusions, active/enabled classification filtering, and high false-positive sensitivity behavior. + +## Verification summary + +- `python scripts/export_taxonomy_excel.py /tmp/are-taxonomy-audit.xlsx` passed and produced a workbook file. +- Python `import_taxonomy_excel('/tmp/are-taxonomy-audit.xlsx')` loaded 1,103 workbook entries. +- Python `import_workbook('/tmp/are-taxonomy-audit.xlsx', temp_root)` completed mechanically with 1,103 entries and 49 active classification entries, but returned 9 validation errors and 23 warnings. +- `load_all_packs()` found 1,103 entries and no duplicate IDs. +- Healthy reasoning patterns had 0 entries enabled for classification. +- Deprecated entries had 0 active/enabled classification entries. +- Backlog exists as `data/taxonomy/candidate_backlog.yaml` and is not loaded by `load_all_packs()`. + +## Issues + +### TAX-001 — Active dashboard taxonomy is not the full pack set + +- severity: high +- file(s): `backend/app/core/paths.py`, `backend/app/services/taxonomy_service.py`, `data/taxonomy/packs/starter-pack.yaml` +- problem: The dashboard/API active taxonomy points to only `starter-pack.yaml`; full pack import/export scripts handle 1,103 entries. +- why it matters: Workbench coverage/export from the dashboard can mislead users into thinking only 3 taxonomy entries exist. +- recommended fix: Add explicit active taxonomy configuration and label the dashboard coverage as starter-only if that remains the intended default. +- verification command: `curl -fsS http://127.0.0.1:8002/taxonomy-workbench/coverage | python -m json.tool`. 
+ +### TAX-002 — Workbook import/export works mechanically but validation fails + +- severity: high +- file(s): `engine/argument_risk_engine/taxonomy/importer.py`, `engine/argument_risk_engine/taxonomy/exporter.py`, `engine/argument_risk_engine/taxonomy/validator.py`, `data/taxonomy/packs/starter-pack.yaml` +- problem: Round-trip import/export works, but validation reports active starter entries missing negative examples, minimum evidence requirements, and false-positive warnings. +- why it matters: A mechanically importable taxonomy can still be unsafe for classification. +- recommended fix: Treat validation failure as a release blocker for entries enabled for classification. +- verification command: `python scripts/export_taxonomy_excel.py /tmp/are-taxonomy-audit.xlsx && python - <<'PY' ... import_workbook(... temp_root) ... PY`. + +### TAX-003 — Source refs are present in large packs but starter active entries are sparse + +- severity: medium +- file(s): `data/taxonomy/packs/core_mvp.yaml`, `data/taxonomy/packs/starter-pack.yaml`, `engine/argument_risk_engine/taxonomy/source_registry.py` +- problem: Large-pack entries include source refs, while starter-pack entries are legacy/sparse and quality checks flag missing supporting metadata. +- why it matters: Active findings should be traceable to source refs or clear operational definitions. +- recommended fix: Migrate starter active entries to the v0.2 schema quality level or retire the starter pack as the dashboard default. +- verification command: `python -c "from argument_risk_engine.taxonomy.pack_manager import load_all_packs; print(load_all_packs().entries[0].source_refs)"`. 
+ +### TAX-004 — Healthy/deprecated/backlog exclusions pass in code inspection but need API tests + +- severity: medium +- file(s): `engine/argument_risk_engine/taxonomy/pack_manager.py`, `engine/argument_risk_engine/classification/classifier.py`, `data/taxonomy/candidate_backlog.yaml` +- problem: Code excludes healthy patterns and requires active/enabled entries, but tests should explicitly cover API/dashboard classification against full packs. +- why it matters: Future taxonomy imports could accidentally enable healthy or backlog entries for final risks. +- recommended fix: Add tests asserting healthy entries appear only as suppressors, deprecated/review/backlog entries never become final risks, and active/enabled is required. +- verification command: `python - <<'PY'\nfrom argument_risk_engine.taxonomy.pack_manager import load_all_packs, active_classification_entries\nprint(len(active_classification_entries(load_all_packs())))\nPY`. diff --git a/backend/app/main.py b/backend/app/main.py index 6986d87..5074d1c 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -23,8 +23,11 @@ ) app.include_router(routes_analysis.router) +app.include_router(routes_taxonomy.router) +app.include_router(routes_taxonomy_workbench.router) app.include_router(routes_review.router) app.include_router(routes_evaluation.router) +app.include_router(routes_settings.router) app.include_router(routes_reports.router) app.include_router(routes_analysis.router, prefix="/api") app.include_router(routes_taxonomy.router, prefix="/api") diff --git a/build_backend.py b/build_backend.py index 2d41756..4018aae 100644 --- a/build_backend.py +++ b/build_backend.py @@ -22,6 +22,14 @@ def _metadata() -> str: ]) +def _entry_points() -> str: + return "\n".join([ + "[console_scripts]", + "uvicorn=uvicorn.main:main", + "", + ]) + + def _wheel() -> str: return "\n".join([ "Wheel-Version: 1.0", @@ -76,6 +84,7 @@ def _write_wheel(out_dir: Path, editable: bool) -> str: files: dict[str, bytes] = { 
f"{DIST}/METADATA": _metadata().encode(), f"{DIST}/WHEEL": _wheel().encode(), + f"{DIST}/entry_points.txt": _entry_points().encode(), } if editable: files["argument_risk_engine_editable.pth"] = f"{ROOT}\n{ROOT / 'engine'}\n".encode() diff --git a/engine/argument_risk_engine/reports/html.py b/engine/argument_risk_engine/reports/html.py index c64906d..66c2261 100644 --- a/engine/argument_risk_engine/reports/html.py +++ b/engine/argument_risk_engine/reports/html.py @@ -3,6 +3,8 @@ from html import escape from typing import Any +from argument_risk_engine.reports.json_export import LIMITATIONS_NOTE + def render_html_report(result: dict[str, Any]) -> str: claims = result.get("claims", []) or [] @@ -16,7 +18,7 @@ def render_html_report(result: dict[str, Any]) -> str:

Analysis ID: {escape(str(result.get('analysis_id') or result.get('text_id', 'unknown')))}

Overall risk score: {escape(str(result.get('overall_risk_score', 0)))}

Risk level: {escape(str(result.get('risk_level', 'unknown')))}

-

Metrics and reports are review aids only and do not claim scientific validation.

+

{escape(LIMITATIONS_NOTE)}

Summary