From 8b723e4042057adacb421a3c2cdf355b9abf8136 Mon Sep 17 00:00:00 2001 From: Spbd1 <148923621+Spbd1@users.noreply.github.com> Date: Mon, 18 May 2026 06:10:00 +0000 Subject: [PATCH] Build review evaluation and report workflows --- backend/app/api/routes_evaluation.py | 23 +++++- backend/app/api/routes_reports.py | 42 +++++++++- backend/app/api/routes_review.py | 26 +++++- backend/app/main.py | 3 + backend/app/schemas/evaluation.py | 15 ++++ backend/app/schemas/reports.py | 29 ++++++- backend/app/schemas/review.py | 37 +++++++++ backend/app/services/evaluation_service.py | 37 ++++++++- backend/app/services/report_service.py | 82 +++++++++++++++++++ backend/app/services/review_service.py | 31 ++++++- data/benchmarks/mini_eval_set.jsonl | 3 +- .../evaluation/metrics.py | 82 +++++++++++++++++++ .../argument_risk_engine/evaluation/runner.py | 58 +++++++++++-- engine/argument_risk_engine/reports/html.py | 47 ++++++++++- .../reports/json_export.py | 7 +- .../argument_risk_engine/reports/markdown.py | 47 ++++++++++- engine/argument_risk_engine/review/models.py | 63 +++++++++++++- engine/argument_risk_engine/review/store.py | 59 ++++++++++++- scripts/run_evaluation.py | 32 +++++++- tests/test_evaluation.py | 30 ++++++- tests/test_review_reports_api.py | 53 ++++++++++++ 21 files changed, 773 insertions(+), 33 deletions(-) create mode 100644 tests/test_review_reports_api.py diff --git a/backend/app/api/routes_evaluation.py b/backend/app/api/routes_evaluation.py index 41be172..caf28e9 100644 --- a/backend/app/api/routes_evaluation.py +++ b/backend/app/api/routes_evaluation.py @@ -1,8 +1,27 @@ -from backend.app.services.evaluation_service import evaluate +from __future__ import annotations + +from backend.app.schemas.evaluation import EvaluationRunRequest +from backend.app.services.evaluation_service import evaluate, evaluation_errors, evaluation_summary from fastapi import APIRouter router = APIRouter(prefix="/evaluation", tags=["evaluation"]) + +@router.post("/run") +def run_evaluation(payload: EvaluationRunRequest) -> dict[str, object]: + return evaluate(payload.benchmark_path) + + @router.get("/run") -def run_evaluation() -> dict[str, object]: +def run_evaluation_legacy() -> dict[str, object]: return evaluate() + + +@router.get("/summary") +def summary() -> dict[str, object]: + return evaluation_summary() + + +@router.get("/errors") +def errors() -> dict[str, object]: + return evaluation_errors() diff --git a/backend/app/api/routes_reports.py b/backend/app/api/routes_reports.py index bab17e6..5327f7f 100644 --- a/backend/app/api/routes_reports.py +++ b/backend/app/api/routes_reports.py @@ -1,8 +1,48 @@ -from backend.app.services.report_service import demo_report +from __future__ import annotations + +from backend.app.schemas.reports import ReportFromAnalysisRequest +from backend.app.services.report_service import ( + create_report_from_analysis, + demo_report, + get_report, + get_report_content, + list_reports, +) from fastapi import APIRouter, Response router = APIRouter(prefix="/reports", tags=["reports"]) + +@router.post("/from-analysis") +def from_analysis(payload: ReportFromAnalysisRequest) -> dict[str, object]: + return create_report_from_analysis(payload.analysis, payload.title, payload.formats) + + +@router.get("") +def reports() -> list[dict[str, object]]: + return list_reports() + + +@router.get("/") +def reports_slash() -> list[dict[str, object]]: + return list_reports() + + +@router.get("/{report_id}") +def report_detail(report_id: str) -> dict[str, object]: + report = get_report(report_id) + return report or {"detail": "not found"} + + +@router.get("/{report_id}/download") +def report_download(report_id: str, format: str = "markdown") -> Response: + content = get_report_content(report_id, format) + if content is None: + return Response(content="Report or format not found", media_type="text/plain", status_code=404) + body, media_type, filename = content + return Response(content=body, media_type=media_type, headers={"content-disposition": f'attachment; filename="{filename}"'}) + + @router.get("/demo.md") def report_demo() -> Response: return Response(content=demo_report(), media_type="text/markdown") diff --git a/backend/app/api/routes_review.py b/backend/app/api/routes_review.py index 6630ba9..661e01a 100644 --- a/backend/app/api/routes_review.py +++ b/backend/app/api/routes_review.py @@ -1,10 +1,34 @@ +from __future__ import annotations + from argument_risk_engine.review.models import ReviewFeedback -from backend.app.services.review_service import record_feedback +from backend.app.schemas.review import ReviewItemRequest +from backend.app.services.review_service import ( + create_review_item, + get_review_summary, + list_review_items, + record_feedback, +) from fastapi import APIRouter router = APIRouter(prefix="/review", tags=["review"]) + +@router.get("/items") +def items() -> list[dict[str, object]]: + return list_review_items() + + +@router.post("/items") +def create_item(payload: ReviewItemRequest) -> dict[str, object]: + return create_review_item(payload) + + +@router.get("/summary") +def summary() -> dict[str, object]: + return get_review_summary() + + @router.post("/feedback") def feedback(payload: ReviewFeedback) -> dict[str, str]: return record_feedback(payload) diff --git a/backend/app/main.py b/backend/app/main.py index b485a69..6986d87 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -23,6 +23,9 @@ ) app.include_router(routes_analysis.router) +app.include_router(routes_review.router) +app.include_router(routes_evaluation.router) +app.include_router(routes_reports.router) app.include_router(routes_analysis.router, prefix="/api") app.include_router(routes_taxonomy.router, prefix="/api") app.include_router(routes_taxonomy_workbench.router, prefix="/api") diff --git a/backend/app/schemas/evaluation.py b/backend/app/schemas/evaluation.py index 7d63e3c..fb16f83 100644 --- a/backend/app/schemas/evaluation.py +++ b/backend/app/schemas/evaluation.py @@ -1,5 +1,20 @@ +from __future__ import annotations + +from typing import Any + from pydantic import BaseModel +class EvaluationRunRequest(BaseModel): + benchmark_path: str | None = None + + class EvaluationResponse(BaseModel): items: int + metrics: dict[str, float] + errors: dict[str, list[dict[str, Any]]] + false_positives: list[dict[str, Any]] + false_negatives: list[dict[str, Any]] + evidence_span_misses: list[dict[str, Any]] + analyses: list[dict[str, Any]] + disclaimer: str diff --git a/backend/app/schemas/reports.py b/backend/app/schemas/reports.py index 42acef1..02ef79e 100644 --- a/backend/app/schemas/reports.py +++ b/backend/app/schemas/reports.py @@ -1,5 +1,30 @@ -from pydantic import BaseModel +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + + +class ReportFromAnalysisRequest(BaseModel): + analysis: dict[str, Any] + title: str = "Argument Risk Report" + formats: list[str] = Field(default_factory=lambda: ["json", "markdown", "html"]) + + +class ReportSummary(BaseModel): + report_id: str + title: str + created_at: str + analysis_id: str + formats: list[str] class ReportResponse(BaseModel): - content: str + report_id: str + title: str + created_at: str + analysis_id: str + formats: list[str] + json: str | None = None + markdown: str | None = None + html: str | None = None diff --git a/backend/app/schemas/review.py b/backend/app/schemas/review.py index e69de29..3564ed1 100644 --- a/backend/app/schemas/review.py +++ b/backend/app/schemas/review.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + + +class ReviewItemRequest(BaseModel): + review_id: str | None = None + text_id: str + claim_id: str + claim_text: str + predicted_risks: list[dict[str, Any]] = Field(default_factory=list) + reviewer_decision: str + corrected_labels: list[str] = Field(default_factory=list) + corrected_evidence_spans: list[str] = Field(default_factory=list) + reviewer_notes: str = "" + + +class ReviewItemResponse(BaseModel): + review_id: str + text_id: str + claim_id: str + claim_text: str + predicted_risks: list[dict[str, Any]] + reviewer_decision: str + corrected_labels: list[str] + corrected_evidence_spans: list[str] + reviewer_notes: str + created_at: str + + +class ReviewSummaryResponse(BaseModel): + total_reviews: int + by_decision: dict[str, int] + corrected_label_counts: dict[str, int] + store_path: str diff --git a/backend/app/services/evaluation_service.py b/backend/app/services/evaluation_service.py index 2fdd90a..619ff73 100644 --- a/backend/app/services/evaluation_service.py +++ b/backend/app/services/evaluation_service.py @@ -1,7 +1,40 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + from argument_risk_engine.evaluation.runner import run_evaluation from backend.app.core.paths import DATA_DIR +BENCHMARK_PATH = DATA_DIR / "benchmarks" / "mini_eval_set.jsonl" +EVALUATION_RESULT_PATH = DATA_DIR / "evaluation" / "last_evaluation.json" + + +def evaluate(benchmark_path: str | None = None) -> dict[str, Any]: + path = Path(benchmark_path) if benchmark_path else BENCHMARK_PATH + result = run_evaluation(path) + EVALUATION_RESULT_PATH.parent.mkdir(parents=True, exist_ok=True) + EVALUATION_RESULT_PATH.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8") + return result + + +def evaluation_summary() -> dict[str, Any]: + result = _load_or_run() + return { + "items": result.get("items", 0), + "metrics": result.get("metrics", {}), + "disclaimer": result.get("disclaimer", ""), + } + + +def evaluation_errors() -> dict[str, Any]: + result = _load_or_run() + return result.get("errors", {"false_positives": [], "false_negatives": [], "evidence_span_misses": []}) + -def evaluate() -> dict[str, object]: - return run_evaluation(DATA_DIR / "benchmarks" / "mini_eval_set.jsonl") +def _load_or_run() -> dict[str, Any]: + if EVALUATION_RESULT_PATH.exists(): + return json.loads(EVALUATION_RESULT_PATH.read_text(encoding="utf-8")) + return evaluate() diff --git a/backend/app/services/report_service.py b/backend/app/services/report_service.py index 3e685a6..5b568f8 100644 --- a/backend/app/services/report_service.py +++ b/backend/app/services/report_service.py @@ -1,7 +1,89 @@ +from __future__ import annotations + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any +from uuid import uuid4 + +from argument_risk_engine.reports.html import render_html_report +from argument_risk_engine.reports.json_export import render_json_report from argument_risk_engine.reports.markdown import render_markdown_report +from backend.app.core.paths import REPORTS_DIR from backend.app.services.analyzer_service import analyze +INDEX_PATH = REPORTS_DIR / "reports_index.json" +SUPPORTED_FORMATS = {"json", "markdown", "html"} + + +def create_report_from_analysis(analysis: dict[str, Any], title: str = "Argument Risk Report", formats: list[str] | None = None) -> dict[str, Any]: + requested = [fmt for fmt in (formats or ["json", "markdown", "html"]) if fmt in SUPPORTED_FORMATS] + if not requested: + requested = ["json"] + report_id = f"rpt_{uuid4().hex[:12]}" + created_at = datetime.now(timezone.utc).isoformat() + analysis_id = str(analysis.get("analysis_id") or analysis.get("text_id") or "unknown") + payload: dict[str, Any] = { + "report_id": report_id, + "title": title, + "created_at": created_at, + "analysis_id": analysis_id, + "formats": requested, + } + if "json" in requested: + payload["json"] = render_json_report(analysis) + if "markdown" in requested: + payload["markdown"] = render_markdown_report(analysis) + if "html" in requested: + payload["html"] = render_html_report(analysis) + _write_report(payload) + _append_index({key: payload[key] for key in ("report_id", "title", "created_at", "analysis_id", "formats")}) + return payload + + +def list_reports() -> list[dict[str, Any]]: + return _read_index() + + +def get_report(report_id: str) -> dict[str, Any] | None: + path = _report_path(report_id) + if not path.exists(): + return None + return json.loads(path.read_text(encoding="utf-8")) + + +def get_report_content(report_id: str, report_format: str) -> tuple[str, str, str] | None: + report = get_report(report_id) + if not report or report_format not in SUPPORTED_FORMATS or not report.get(report_format): + return None + extension = "md" if report_format == "markdown" else report_format + media_type = {"json": "application/json", "markdown": "text/markdown", "html": "text/html"}[report_format] + return str(report[report_format]), media_type, f"{report_id}.{extension}" + def demo_report() -> str: return render_markdown_report(analyze("Everyone always caused this problem.")) + + +def _write_report(report: dict[str, Any]) -> None: + REPORTS_DIR.mkdir(parents=True, exist_ok=True) + _report_path(str(report["report_id"])).write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8") + + +def _report_path(report_id: str) -> Path: + safe = "".join(char for char in report_id if char.isalnum() or char in {"_", "-"}) + return REPORTS_DIR / f"{safe}.json" + + +def _read_index() -> list[dict[str, Any]]: + if not INDEX_PATH.exists(): + return [] + return json.loads(INDEX_PATH.read_text(encoding="utf-8")) + + +def _append_index(summary: dict[str, Any]) -> None: + REPORTS_DIR.mkdir(parents=True, exist_ok=True) + records = [item for item in _read_index() if item.get("report_id") != summary["report_id"]] + records.insert(0, summary) + INDEX_PATH.write_text(json.dumps(records, indent=2, ensure_ascii=False), encoding="utf-8") diff --git a/backend/app/services/review_service.py b/backend/app/services/review_service.py index 387f6ee..b466264 100644 --- a/backend/app/services/review_service.py +++ b/backend/app/services/review_service.py @@ -1,7 +1,34 @@ -from argument_risk_engine.review.models import ReviewFeedback -from argument_risk_engine.review.store import append_feedback +from __future__ import annotations + +from argument_risk_engine.review.models import ReviewFeedback, ReviewItem +from argument_risk_engine.review.store import ( + append_feedback, + append_review_item, + read_review_items, + review_summary, +) from backend.app.core.paths import REVIEW_STORE_PATH +from backend.app.schemas.review import ReviewItemRequest + + +def list_review_items() -> list[dict[str, object]]: + return [item.model_dump() for item in read_review_items(REVIEW_STORE_PATH)] + + +def create_review_item(payload: ReviewItemRequest | ReviewItem) -> dict[str, object]: + if isinstance(payload, ReviewItem): + item = payload + else: + data = payload.model_dump() + if not data.get("review_id"): + data.pop("review_id", None) + item = ReviewItem(**data) + return append_review_item(REVIEW_STORE_PATH, item).model_dump() + + +def get_review_summary() -> dict[str, object]: + return review_summary(REVIEW_STORE_PATH) def record_feedback(feedback: ReviewFeedback) -> dict[str, str]: diff --git a/data/benchmarks/mini_eval_set.jsonl b/data/benchmarks/mini_eval_set.jsonl index 93c1695..93f28e1 100644 --- a/data/benchmarks/mini_eval_set.jsonl +++ b/data/benchmarks/mini_eval_set.jsonl @@ -1 +1,2 @@ -{"text":"They are vermin.","expected":["dehumanizing_language"]} +{"id":"sample_001","text":"They are vermin.","gold_labels":["dehumanizing_language"],"gold_evidence_spans":["vermin"],"difficulty":"easy","notes":"Simple dehumanizing-language smoke test for MVP evaluation."} +{"id":"sample_002","text":"This proposal has tradeoffs and the evidence is mixed.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Neutral reasoning sample used to monitor over-classification."} diff --git a/engine/argument_risk_engine/evaluation/metrics.py b/engine/argument_risk_engine/evaluation/metrics.py index e498aa4..b6acb14 100644 --- a/engine/argument_risk_engine/evaluation/metrics.py +++ b/engine/argument_risk_engine/evaluation/metrics.py @@ -1,2 +1,84 @@ +from __future__ import annotations + +from typing import Any + + def precision(tp: int, fp: int) -> float: return tp / (tp + fp) if tp + fp else 0.0 + + +def recall(tp: int, fn: int) -> float: + return tp / (tp + fn) if tp + fn else 0.0 + + +def f1_score(label_precision: float, label_recall: float) -> float: + return 2 * label_precision * label_recall / (label_precision + label_recall) if label_precision + label_recall else 0.0 + + +def partial_span_match(predicted: str, gold: str) -> bool: + predicted_norm = predicted.strip().lower() + gold_norm = gold.strip().lower() + return bool(predicted_norm and gold_norm and (predicted_norm in gold_norm or gold_norm in predicted_norm)) + + +def compute_metrics(rows: list[dict[str, Any]], analyses: list[dict[str, Any]]) -> dict[str, float]: + tp = fp = fn = 0 + exact_matches = partial_matches = span_cases = 0 + review_count = over_classified = no_finding = 0 + + for row, analysis in zip(rows, analyses, strict=False): + gold_labels = set(_gold_labels(row)) + predicted_labels = set(_predicted_labels(analysis)) + tp += len(predicted_labels & gold_labels) + fp += len(predicted_labels - gold_labels) + fn += len(gold_labels - predicted_labels) + + if analysis.get("needs_human_review"): + review_count += 1 + if len(predicted_labels) > len(gold_labels): + over_classified += 1 + if not predicted_labels: + no_finding += 1 + + gold_spans = _gold_spans(row) + predicted_spans = _predicted_spans(analysis) + if gold_spans: + span_cases += 1 + if any(pred == gold for pred in predicted_spans for gold in gold_spans): + exact_matches += 1 + if any(partial_span_match(pred, gold) for pred in predicted_spans for gold in gold_spans): + partial_matches += 1 + + label_precision = precision(tp, fp) + label_recall = recall(tp, fn) + total = len(rows) + return { + "label_precision": round(label_precision, 4), + "label_recall": round(label_recall, 4), + "label_f1": round(f1_score(label_precision, label_recall), 4), + "false_positive_rate": round(fp / (fp + tp) if fp + tp else 0.0, 4), + "evidence_span_exact_match": round(exact_matches / span_cases if span_cases else 0.0, 4), + "evidence_span_partial_match": round(partial_matches / span_cases if span_cases else 0.0, 4), + "human_review_rate": round(review_count / total if total else 0.0, 4), + "over_classification_rate": round(over_classified / total if total else 0.0, 4), + "no_finding_rate": round(no_finding / total if total else 0.0, 4), + } + + +def _gold_labels(row: dict[str, Any]) -> list[str]: + return [str(label) for label in row.get("gold_labels", row.get("expected", []))] + + +def _gold_spans(row: dict[str, Any]) -> list[str]: + return [str(span) for span in row.get("gold_evidence_spans", [])] + + +def _predicted_labels(analysis: dict[str, Any]) -> list[str]: + labels: list[str] = [] + for risk in analysis.get("risks", []): + labels.append(str(risk.get("risk_id") or risk.get("label") or "")) + return [label for label in labels if label] + + +def _predicted_spans(analysis: dict[str, Any]) -> list[str]: + return [str(risk.get("evidence_span") or "") for risk in analysis.get("risks", []) if risk.get("evidence_span")] diff --git a/engine/argument_risk_engine/evaluation/runner.py b/engine/argument_risk_engine/evaluation/runner.py index 84290e4..43dccb7 100644 --- a/engine/argument_risk_engine/evaluation/runner.py +++ b/engine/argument_risk_engine/evaluation/runner.py @@ -2,14 +2,60 @@ import json from pathlib import Path +from typing import Any from argument_risk_engine.analyzer import analyze_text +from argument_risk_engine.evaluation.metrics import compute_metrics +DISCLAIMER = ( + "MVP evaluation metrics are operational QA indicators for this benchmark only; " + "they do not establish scientific or clinical validation." +) -def run_evaluation(path: Path | str) -> dict[str, object]: - rows = [] - p = Path(path) - if p.exists(): - rows = [json.loads(line) for line in p.read_text().splitlines() if line.strip()] + +def run_evaluation(path: Path | str) -> dict[str, Any]: + rows = load_benchmark(path) analyses = [analyze_text(row.get("text", "")) for row in rows] - return {"items": len(rows), "analyses": analyses} + metrics = compute_metrics(rows, analyses) + errors = collect_errors(rows, analyses) + return { + "items": len(rows), + "metrics": metrics, + "errors": errors, + "false_positives": errors["false_positives"], + "false_negatives": errors["false_negatives"], + "evidence_span_misses": errors["evidence_span_misses"], + "analyses": analyses, + "disclaimer": DISCLAIMER, + } + + +def load_benchmark(path: Path | str) -> list[dict[str, Any]]: + p = Path(path) + if not p.exists(): + return [] + return [json.loads(line) for line in p.read_text(encoding="utf-8").splitlines() if line.strip()] + + +def collect_errors(rows: list[dict[str, Any]], analyses: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]: + false_positives: list[dict[str, Any]] = [] + false_negatives: list[dict[str, Any]] = [] + evidence_span_misses: list[dict[str, Any]] = [] + for row, analysis in zip(rows, analyses, strict=False): + row_id = str(row.get("id") or row.get("text_id") or "unknown") + gold_labels = {str(label) for label in row.get("gold_labels", row.get("expected", []))} + predicted_labels = {str(risk.get("risk_id") or risk.get("label") or "") for risk in analysis.get("risks", []) if risk} + for label in sorted(predicted_labels - gold_labels): + false_positives.append({"id": row_id, "label": label, "text": row.get("text", "")}) + for label in sorted(gold_labels - predicted_labels): + false_negatives.append({"id": row_id, "label": label, "text": row.get("text", "")}) + gold_spans = [str(span) for span in row.get("gold_evidence_spans", [])] + if gold_spans: + predicted_spans = [str(risk.get("evidence_span") or "") for risk in analysis.get("risks", [])] + if not any(pred == gold for pred in predicted_spans for gold in gold_spans): + evidence_span_misses.append({"id": row_id, "gold_spans": gold_spans, "predicted_spans": predicted_spans}) + return { + "false_positives": false_positives, + "false_negatives": false_negatives, + "evidence_span_misses": evidence_span_misses, + } diff --git a/engine/argument_risk_engine/reports/html.py b/engine/argument_risk_engine/reports/html.py index f9b1764..c64906d 100644 --- a/engine/argument_risk_engine/reports/html.py +++ b/engine/argument_risk_engine/reports/html.py @@ -1,2 +1,45 @@ -def render_html_report(result): - return "

Argument Risk Report

" +from __future__ import annotations + +from html import escape +from typing import Any + + +def render_html_report(result: dict[str, Any]) -> str: + claims = result.get("claims", []) or [] + risks = result.get("risks", []) or [risk for claim in claims for risk in claim.get("detected_risks", [])] + findings = "".join(_risk_html(risk) for risk in risks) or "

No risk findings were detected by the current analysis configuration.

" + return f""" + +Argument Risk Report + +

Argument Risk Report

+

Analysis ID: {escape(str(result.get('analysis_id') or result.get('text_id', 'unknown')))}

+

Overall risk score: {escape(str(result.get('overall_risk_score', 0)))}

+

Risk level: {escape(str(result.get('risk_level', 'unknown')))}

+

Metrics and reports are review aids only and do not claim scientific validation.

+

Summary

+ +

Findings

+ {findings} + + +""" + + +def _risk_html(risk: dict[str, Any]) -> str: + return f"""
+

{escape(str(risk.get('label', risk.get('risk_id', 'Finding'))))}

+ +

{escape(str(risk.get('explanation', '')))}

+
""" diff --git a/engine/argument_risk_engine/reports/json_export.py b/engine/argument_risk_engine/reports/json_export.py index 5555b41..83747b3 100644 --- a/engine/argument_risk_engine/reports/json_export.py +++ b/engine/argument_risk_engine/reports/json_export.py @@ -1,5 +1,8 @@ +from __future__ import annotations + import json +from typing import Any -def render_json_report(result): - return json.dumps(result, indent=2) +def render_json_report(result: dict[str, Any]) -> str: + return json.dumps(result, indent=2, ensure_ascii=False, sort_keys=True) diff --git a/engine/argument_risk_engine/reports/markdown.py b/engine/argument_risk_engine/reports/markdown.py index 07203cc..5d03ebc 100644 --- a/engine/argument_risk_engine/reports/markdown.py +++ b/engine/argument_risk_engine/reports/markdown.py @@ -1,2 +1,45 @@ -def render_markdown_report(result: dict[str, object]) -> str: - return f"# Argument Risk Report\n\nAnalysis: {result.get('analysis_id')}\n\nRisks: {len(result.get('risks', []))}\n" +from __future__ import annotations + +from typing import Any + + +def render_markdown_report(result: dict[str, Any]) -> str: + claims = result.get("claims", []) or [] + risks = result.get("risks", []) or [risk for claim in claims for risk in claim.get("detected_risks", [])] + lines = [ + "# Argument Risk Report", + "", + f"Analysis ID: `{result.get('analysis_id') or result.get('text_id', 'unknown')}`", + f"Overall risk score: **{result.get('overall_risk_score', 0)}**", + f"Risk level: **{result.get('risk_level', 'unknown')}**", + "", + "> Metrics and reports are review aids only and do not claim scientific validation.", + "", + "## Summary", + "", + f"- Claims reviewed: {len(claims)}", + f"- Detected risks: {len(risks)}", + f"- Needs human review: {bool(result.get('needs_human_review', False))}", + "", + "## Findings", + "", + ] + if not risks: + lines.append("No risk findings were detected by the current analysis configuration.") + for risk in risks: + lines.extend( + [ + f"### {risk.get('label', risk.get('risk_id', 'Finding'))}", + "", + f"- Risk ID: `{risk.get('risk_id', '')}`", + f"- Category: {risk.get('category', '')}", + f"- Severity: {risk.get('severity', '')}", + f"- Confidence: {risk.get('confidence', 0)}", + f"- Evidence: {risk.get('evidence_span', '')}", + f"- Human review recommended: {bool(risk.get('needs_human_review', False))}", + "", + str(risk.get("explanation", "")), + "", + ] + ) + return "\n".join(lines).strip() + "\n" diff --git a/engine/argument_risk_engine/review/models.py b/engine/argument_risk_engine/review/models.py index 4adc52b..2658882 100644 --- a/engine/argument_risk_engine/review/models.py +++ b/engine/argument_risk_engine/review/models.py @@ -1,8 +1,69 @@ -from pydantic import BaseModel +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, Literal +from uuid import uuid4 + +from pydantic import BaseModel, Field + +ReviewDecision = Literal[ + "correct", + "incorrect", + "partially_correct", + "insufficient_evidence", + "unclear", +] + +VALID_REVIEW_DECISIONS = { + "correct", + "incorrect", + "partially_correct", + "insufficient_evidence", + "unclear", +} + + +class ReviewValidationError(ValueError): + """Raised when a review record is incomplete or invalid.""" + + +class ReviewItem(BaseModel): + review_id: str = Field(default_factory=lambda: f"rev_{uuid4().hex[:12]}") + text_id: str + claim_id: str + claim_text: str + predicted_risks: list[dict[str, Any]] = Field(default_factory=list) + reviewer_decision: ReviewDecision + corrected_labels: list[str] = Field(default_factory=list) + corrected_evidence_spans: list[str] = Field(default_factory=list) + reviewer_notes: str = "" + created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).isoformat()) + + def validate_item(self) -> None: + validate_review_item(self.model_dump()) class ReviewFeedback(BaseModel): + """Legacy feedback shape kept for older dashboard clients.""" + analysis_id: str taxonomy_id: str | None = None decision: str notes: str = "" + + +def validate_review_item(data: dict[str, Any]) -> None: + required_strings = ["review_id", "text_id", "claim_id", "claim_text", "reviewer_decision", "created_at"] + missing = [field for field in required_strings if not str(data.get(field) or "").strip()] + if missing: + raise ReviewValidationError(f"Review item missing required fields: {', '.join(missing)}") + decision = str(data.get("reviewer_decision") or "") + if decision not in VALID_REVIEW_DECISIONS: + raise ReviewValidationError(f"Unsupported reviewer_decision: {decision}") + for list_field in ("predicted_risks", "corrected_labels", "corrected_evidence_spans"): + if not isinstance(data.get(list_field, []), list): + raise ReviewValidationError(f"{list_field} must be a list") + try: + datetime.fromisoformat(str(data["created_at"]).replace("Z", "+00:00")) + except ValueError as exc: + raise ReviewValidationError("created_at must be an ISO-8601 timestamp") from exc diff --git a/engine/argument_risk_engine/review/store.py b/engine/argument_risk_engine/review/store.py index a7ff17b..1672945 100644 --- a/engine/argument_risk_engine/review/store.py +++ b/engine/argument_risk_engine/review/store.py @@ -2,11 +2,62 @@ import json from pathlib import Path +from typing import Any -from argument_risk_engine.review.models import ReviewFeedback +from argument_risk_engine.review.models import ReviewFeedback, ReviewItem, validate_review_item -def append_feedback(path: Path, feedback: ReviewFeedback) -> None: +def append_review_item(path: Path, item: ReviewItem) -> ReviewItem: + data = item.model_dump() + validate_review_item(data) path.parent.mkdir(parents=True, exist_ok=True) - with path.open("a") as handle: - handle.write(json.dumps(feedback.model_dump()) + "\n") + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(data, ensure_ascii=False, sort_keys=True) + "\n") + return item + + +def read_review_items(path: Path) -> list[ReviewItem]: + if not path.exists(): + return [] + items: list[ReviewItem] = [] + for _line_number, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1): + if not line.strip(): + continue + data = json.loads(line) + validate_review_item(data) + items.append(ReviewItem(**data)) + return items + + +def review_summary(path: Path) -> dict[str, Any]: + items = read_review_items(path) + by_decision: dict[str, int] = {} + corrected_label_counts: dict[str, int] = {} + for item in items: + by_decision[item.reviewer_decision] = by_decision.get(item.reviewer_decision, 0) + 1 + for label in item.corrected_labels: + corrected_label_counts[label] = corrected_label_counts.get(label, 0) + 1 + return { + "total_reviews": len(items), + "by_decision": by_decision, + "corrected_label_counts": corrected_label_counts, + "store_path": str(path), + } + + +def append_feedback(path: Path, feedback: ReviewFeedback) -> None: + """Legacy adapter that records old feedback payloads in the append-only review store.""" + + item = ReviewItem( + text_id=feedback.analysis_id, + claim_id=feedback.taxonomy_id or "legacy_feedback", + claim_text="", + predicted_risks=[{"taxonomy_id": feedback.taxonomy_id}] if feedback.taxonomy_id else [], + reviewer_decision=_legacy_decision(feedback.decision), + reviewer_notes=feedback.notes, + ) + append_review_item(path, item) + + +def _legacy_decision(decision: str) -> str: + return {"partial": "partially_correct"}.get(decision, decision) diff --git a/scripts/run_evaluation.py b/scripts/run_evaluation.py index 5c19afd..af5d46a 100644 --- a/scripts/run_evaluation.py +++ b/scripts/run_evaluation.py @@ -1,7 +1,33 @@ +from __future__ import annotations + +import argparse +import json +import sys from pathlib import Path -from argument_risk_engine.evaluation.runner import run_evaluation +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "engine")) +sys.path.insert(0, str(ROOT)) + +from argument_risk_engine.evaluation.runner import run_evaluation # noqa: E402,I001 + if __name__ == "__main__": - result = run_evaluation(Path(__file__).resolve().parents[1] / "data/benchmarks/mini_eval_set.jsonl") - print(f"Evaluated {result['items']} items") + parser = argparse.ArgumentParser(description="Run the Argument Risk Engine mini benchmark.") + parser.add_argument( + "benchmark", + nargs="?", + default=str(ROOT / "data/benchmarks/mini_eval_set.jsonl"), + help="Path to a JSONL benchmark file.", + ) + parser.add_argument("--json", action="store_true", help="Print the full evaluation payload as JSON.") + args = parser.parse_args() + + result = run_evaluation(Path(args.benchmark)) + if args.json: + print(json.dumps(result, indent=2, ensure_ascii=False)) + else: + print(f"Evaluated {result['items']} items") + for name, value in result["metrics"].items(): + print(f"{name}: {value}") + print(result["disclaimer"]) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 1baefc5..de75528 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,7 +1,33 @@ +from __future__ import annotations + +from argument_risk_engine.evaluation.metrics import compute_metrics from argument_risk_engine.evaluation.runner import run_evaluation def test_evaluation_runner(tmp_path): path = tmp_path / "eval.jsonl" - path.write_text('{"text":"They are vermin."}\n') - assert run_evaluation(path)["items"] == 1 + path.write_text('{"id":"sample_001","text":"They are vermin.","gold_labels":["dehumanizing_language"],"gold_evidence_spans":["vermin"],"difficulty":"easy","notes":"smoke"}\n') + result = run_evaluation(path) + + assert result["items"] == 1 + assert "label_precision" in result["metrics"] + assert "do not establish scientific" in result["disclaimer"] + + +def test_metrics_include_required_operational_rates(): + rows = [{"gold_labels": ["a"], "gold_evidence_spans": ["abc"]}] + analyses = [{"risks": [{"risk_id": "a", "evidence_span": "abc"}], "needs_human_review": True}] + + metrics = compute_metrics(rows, analyses) + + assert metrics == { + "label_precision": 1.0, + "label_recall": 1.0, + "label_f1": 1.0, + "false_positive_rate": 0.0, + "evidence_span_exact_match": 1.0, + "evidence_span_partial_match": 1.0, + "human_review_rate": 1.0, + "over_classification_rate": 0.0, + "no_finding_rate": 0.0, + } diff --git a/tests/test_review_reports_api.py b/tests/test_review_reports_api.py new file mode 100644 index 0000000..90fdcf1 --- /dev/null +++ b/tests/test_review_reports_api.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from backend.app.main import app +from fastapi.testclient import TestClient + + +def test_review_items_endpoint_records_item(tmp_path, monkeypatch): + from backend.app.services import review_service + + monkeypatch.setattr(review_service, "REVIEW_STORE_PATH", tmp_path / "review_store.jsonl") + client = TestClient(app) + response = client.post( + "/api/review/items", + json={ + "text_id": "txt_test", + "claim_id": "claim_1", + "claim_text": "They are vermin.", + "predicted_risks": [{"risk_id": "dehumanizing_language"}], + "reviewer_decision": "correct", + "corrected_labels": ["dehumanizing_language"], + "corrected_evidence_spans": ["vermin"], + "reviewer_notes": "ok", + }, + ) + + assert response.status_code == 200 + assert response.json()["review_id"].startswith("rev_") + assert client.get("/api/review/summary").json()["total_reviews"] == 1 + + +def test_evaluation_and_reports_endpoints(tmp_path, monkeypatch): + from backend.app.services import evaluation_service, report_service + + monkeypatch.setattr(evaluation_service, "EVALUATION_RESULT_PATH", tmp_path / "last_evaluation.json") + monkeypatch.setattr(report_service, "REPORTS_DIR", tmp_path / "reports") + monkeypatch.setattr(report_service, "INDEX_PATH", tmp_path / "reports" / "reports_index.json") + client = TestClient(app) + evaluation = client.post("/api/evaluation/run", json={}).json() + + assert evaluation["items"] >= 1 + assert "label_f1" in evaluation["metrics"] + + analysis = evaluation["analyses"][0] + report = client.post( + "/api/reports/from-analysis", + json={"analysis": analysis, "title": "Smoke report", "formats": ["json", "markdown", "html"]}, + ).json() + + assert report["report_id"].startswith("rpt_") + assert "Argument Risk Report" in report["markdown"] + download = client.get(f"/api/reports/{report['report_id']}/download?format=markdown") + assert download.status_code == 200 + assert "Argument Risk Report" in download.content