diff --git a/backend/app/api/routes_evaluation.py b/backend/app/api/routes_evaluation.py
index 41be172..caf28e9 100644
--- a/backend/app/api/routes_evaluation.py
+++ b/backend/app/api/routes_evaluation.py
@@ -1,8 +1,27 @@
-from backend.app.services.evaluation_service import evaluate
+from __future__ import annotations
+
+from backend.app.schemas.evaluation import EvaluationRunRequest
+from backend.app.services.evaluation_service import evaluate, evaluation_errors, evaluation_summary
 from fastapi import APIRouter
 
 router = APIRouter(prefix="/evaluation", tags=["evaluation"])
 
+
+@router.post("/run")
+def run_evaluation(payload: EvaluationRunRequest) -> dict[str, object]:
+    return evaluate(payload.benchmark_path)
+
+
 @router.get("/run")
-def run_evaluation() -> dict[str, object]:
+def run_evaluation_legacy() -> dict[str, object]:
     return evaluate()
+
+
+@router.get("/summary")
+def summary() -> dict[str, object]:
+    return evaluation_summary()
+
+
+@router.get("/errors")
+def errors() -> dict[str, object]:
+    return evaluation_errors()
diff --git a/backend/app/api/routes_reports.py b/backend/app/api/routes_reports.py
index bab17e6..5327f7f 100644
--- a/backend/app/api/routes_reports.py
+++ b/backend/app/api/routes_reports.py
@@ -1,8 +1,55 @@
-from backend.app.services.report_service import demo_report
+from __future__ import annotations
+
+from backend.app.schemas.reports import ReportFromAnalysisRequest
+from backend.app.services.report_service import (
+    create_report_from_analysis,
+    demo_report,
+    get_report,
+    get_report_content,
+    list_reports,
+)
-from fastapi import APIRouter, Response
+from fastapi import APIRouter, HTTPException, Response
 
 router = APIRouter(prefix="/reports", tags=["reports"])
 
+
+@router.post("/from-analysis")
+def from_analysis(payload: ReportFromAnalysisRequest) -> dict[str, object]:
+    return create_report_from_analysis(payload.analysis, payload.title, payload.formats)
+
+
+@router.get("")
+def reports() -> list[dict[str, object]]:
+    return list_reports()
+
+
+@router.get("/")
+def reports_slash() -> list[dict[str, object]]:
+    return list_reports()
+
+
+# Keep this static route ABOVE "/{report_id}": routes are matched in declaration
+# order, so the dynamic route would otherwise capture "/reports/demo.md".
 @router.get("/demo.md")
 def report_demo() -> Response:
     return Response(content=demo_report(), media_type="text/markdown")
+
+
+@router.get("/{report_id}")
+def report_detail(report_id: str) -> dict[str, object]:
+    """Return the stored report, or respond 404 when it does not exist."""
+    report = get_report(report_id)
+    if not report:
+        # Same {"detail": "not found"} body as before, but with a 404 status
+        # instead of a misleading 200.
+        raise HTTPException(status_code=404, detail="not found")
+    return report
+
+
+@router.get("/{report_id}/download")
+def report_download(report_id: str, format: str = "markdown") -> Response:
+    content = get_report_content(report_id, format)
+    if content is None:
+        return Response(content="Report or format not found", media_type="text/plain", status_code=404)
+    body, media_type, filename = content
+    return Response(content=body, media_type=media_type, headers={"content-disposition": f'attachment; filename="{filename}"'})
diff --git a/backend/app/api/routes_review.py b/backend/app/api/routes_review.py
index 6630ba9..661e01a 100644
--- a/backend/app/api/routes_review.py
+++ b/backend/app/api/routes_review.py
@@ -1,10 +1,34 @@
+from __future__ import annotations
+
 from argument_risk_engine.review.models import ReviewFeedback
-from backend.app.services.review_service import record_feedback
+from backend.app.schemas.review import ReviewItemRequest
+from backend.app.services.review_service import (
+    create_review_item,
+    get_review_summary,
+    list_review_items,
+    record_feedback,
+)
 from fastapi import APIRouter
 
 
 router = APIRouter(prefix="/review", tags=["review"])
 
+
+@router.get("/items")
+def items() -> list[dict[str, object]]:
+    return list_review_items()
+
+
+@router.post("/items")
+def create_item(payload: ReviewItemRequest) -> dict[str, object]:
+    return create_review_item(payload)
+
+
+@router.get("/summary")
+def summary() -> dict[str, object]:
+    return get_review_summary()
+
+
 @router.post("/feedback")
 def feedback(payload: ReviewFeedback) -> dict[str, str]:
     return record_feedback(payload)
diff --git 
a/backend/app/main.py b/backend/app/main.py
index b485a69..6986d87 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -23,6 +23,9 @@
 )
 
 app.include_router(routes_analysis.router)
+app.include_router(routes_review.router)
+app.include_router(routes_evaluation.router)
+app.include_router(routes_reports.router)
 app.include_router(routes_analysis.router, prefix="/api")
 app.include_router(routes_taxonomy.router, prefix="/api")
 app.include_router(routes_taxonomy_workbench.router, prefix="/api")
diff --git a/backend/app/schemas/evaluation.py b/backend/app/schemas/evaluation.py
index 7d63e3c..fb16f83 100644
--- a/backend/app/schemas/evaluation.py
+++ b/backend/app/schemas/evaluation.py
@@ -1,5 +1,20 @@
+from __future__ import annotations
+
+from typing import Any
+
 from pydantic import BaseModel
 
 
+class EvaluationRunRequest(BaseModel):
+    benchmark_path: str | None = None
+
+
 class EvaluationResponse(BaseModel):
     items: int
+    metrics: dict[str, float]
+    errors: dict[str, list[dict[str, Any]]]
+    false_positives: list[dict[str, Any]]
+    false_negatives: list[dict[str, Any]]
+    evidence_span_misses: list[dict[str, Any]]
+    analyses: list[dict[str, Any]]
+    disclaimer: str
diff --git a/backend/app/schemas/reports.py b/backend/app/schemas/reports.py
index 42acef1..02ef79e 100644
--- a/backend/app/schemas/reports.py
+++ b/backend/app/schemas/reports.py
@@ -1,5 +1,34 @@
-from pydantic import BaseModel
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ReportFromAnalysisRequest(BaseModel):
+    analysis: dict[str, Any]
+    title: str = "Argument Risk Report"
+    formats: list[str] = Field(default_factory=lambda: ["json", "markdown", "html"])
+
+
+class ReportSummary(BaseModel):
+    report_id: str
+    title: str
+    created_at: str
+    analysis_id: str
+    formats: list[str]
 
 
 class ReportResponse(BaseModel):
-    content: str
+    # A field literally named "json" shadows BaseModel.json(); store it under a
+    # safe attribute name and keep "json" as the alias (dump with by_alias=True).
+    model_config = ConfigDict(populate_by_name=True)
+
+    report_id: str
+    title: str
+    created_at: str
+    analysis_id: str
+    formats: list[str]
+    json_content: str | None = Field(default=None, alias="json")
+    markdown: str | None = None
+    html: str | None = None
diff --git 
a/backend/app/schemas/review.py b/backend/app/schemas/review.py
index e69de29..3564ed1 100644
--- a/backend/app/schemas/review.py
+++ b/backend/app/schemas/review.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class ReviewItemRequest(BaseModel):
+    review_id: str | None = None
+    text_id: str
+    claim_id: str
+    claim_text: str
+    predicted_risks: list[dict[str, Any]] = Field(default_factory=list)
+    reviewer_decision: str
+    corrected_labels: list[str] = Field(default_factory=list)
+    corrected_evidence_spans: list[str] = Field(default_factory=list)
+    reviewer_notes: str = ""
+
+
+class ReviewItemResponse(BaseModel):
+    review_id: str
+    text_id: str
+    claim_id: str
+    claim_text: str
+    predicted_risks: list[dict[str, Any]]
+    reviewer_decision: str
+    corrected_labels: list[str]
+    corrected_evidence_spans: list[str]
+    reviewer_notes: str
+    created_at: str
+
+
+class ReviewSummaryResponse(BaseModel):
+    total_reviews: int
+    by_decision: dict[str, int]
+    corrected_label_counts: dict[str, int]
+    store_path: str
diff --git a/backend/app/services/evaluation_service.py b/backend/app/services/evaluation_service.py
index 2fdd90a..619ff73 100644
--- a/backend/app/services/evaluation_service.py
+++ b/backend/app/services/evaluation_service.py
@@ -1,7 +1,45 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
 from argument_risk_engine.evaluation.runner import run_evaluation
 from backend.app.core.paths import DATA_DIR
 
 
+BENCHMARK_PATH = DATA_DIR / "benchmarks" / "mini_eval_set.jsonl"
+EVALUATION_RESULT_PATH = DATA_DIR / "evaluation" / "last_evaluation.json"
+
+
+def evaluate(benchmark_path: str | None = None) -> dict[str, Any]:
+    path = Path(benchmark_path) if benchmark_path else BENCHMARK_PATH
+    result = run_evaluation(path)
+    EVALUATION_RESULT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    EVALUATION_RESULT_PATH.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
+    return result
+
+
+def evaluation_summary() -> dict[str, Any]:
+    result = _load_or_run()
+    return {
+        "items": result.get("items", 0),
+        "metrics": result.get("metrics", {}),
+        "disclaimer": result.get("disclaimer", ""),
+    }
+
+
+def evaluation_errors() -> dict[str, Any]:
+    result = _load_or_run()
+    return result.get("errors", {"false_positives": [], "false_negatives": [], "evidence_span_misses": []})
+
 
-def evaluate() -> dict[str, object]:
-    return run_evaluation(DATA_DIR / "benchmarks" / "mini_eval_set.jsonl")
+def _load_or_run() -> dict[str, Any]:
+    """Return the cached evaluation result, re-running it when unusable."""
+    try:
+        cached = json.loads(EVALUATION_RESULT_PATH.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # Missing, unreadable, or truncated/corrupt cache file: recompute.
+        return evaluate()
+    # A non-object payload cannot serve the .get() calls made by callers.
+    return cached if isinstance(cached, dict) else evaluate()
diff --git a/backend/app/services/report_service.py b/backend/app/services/report_service.py
index 3e685a6..5b568f8 100644
--- a/backend/app/services/report_service.py
+++ b/backend/app/services/report_service.py
@@ -1,7 +1,94 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+from uuid import uuid4
+
+from argument_risk_engine.reports.html import render_html_report
+from argument_risk_engine.reports.json_export import render_json_report
 from argument_risk_engine.reports.markdown import render_markdown_report
+from backend.app.core.paths import REPORTS_DIR
 from backend.app.services.analyzer_service import analyze
 
 
+INDEX_PATH = REPORTS_DIR / "reports_index.json"
+SUPPORTED_FORMATS = {"json", "markdown", "html"}
+
+
+def create_report_from_analysis(analysis: dict[str, Any], title: str = "Argument Risk Report", formats: list[str] | None = None) -> dict[str, Any]:
+    # dict.fromkeys drops repeated formats while keeping the caller's order.
+    requested = list(dict.fromkeys(fmt for fmt in (formats or ["json", "markdown", "html"]) if fmt in SUPPORTED_FORMATS))
+    if not requested:
+        requested = ["json"]
+    report_id = f"rpt_{uuid4().hex[:12]}"
+    created_at = datetime.now(timezone.utc).isoformat()
+    analysis_id = str(analysis.get("analysis_id") or analysis.get("text_id") or "unknown")
+    payload: dict[str, Any] = {
+        "report_id": report_id,
+        "title": title,
+        "created_at": created_at,
+        "analysis_id": analysis_id,
+        "formats": requested,
+    }
+    if "json" in requested:
+        payload["json"] = render_json_report(analysis)
+    if "markdown" in requested:
+        payload["markdown"] = render_markdown_report(analysis)
+    if "html" in requested:
+        payload["html"] = render_html_report(analysis)
+    _write_report(payload)
+    _append_index({key: payload[key] for key in ("report_id", "title", "created_at", "analysis_id", "formats")})
+    return payload
+
+
+def list_reports() -> list[dict[str, Any]]:
+    return _read_index()
+
+
+def get_report(report_id: str) -> dict[str, Any] | None:
+    """Load a stored report by id, or return None when there is no such report."""
+    path = _report_path(report_id)
+    # "reports_index" sanitizes to the index file itself, which holds a JSON
+    # list rather than a report; never serve it as one.
+    if path == INDEX_PATH or not path.exists():
+        return None
+    data = json.loads(path.read_text(encoding="utf-8"))
+    return data if isinstance(data, dict) else None
+
+
+def get_report_content(report_id: str, report_format: str) -> tuple[str, str, str] | None:
+    report = get_report(report_id)
+    if not report or report_format not in SUPPORTED_FORMATS or not report.get(report_format):
+        return None
+    extension = "md" if report_format == "markdown" else report_format
+    media_type = {"json": "application/json", "markdown": "text/markdown", "html": "text/html"}[report_format]
+    return str(report[report_format]), media_type, f"{report_id}.{extension}"
+
 
 def demo_report() -> str:
     return render_markdown_report(analyze("Everyone always caused this problem."))
+
+
+def _write_report(report: dict[str, Any]) -> None:
+    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
+    _report_path(str(report["report_id"])).write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+
+
+def _report_path(report_id: str) -> Path:
+    safe = "".join(char for char in report_id if char.isalnum() or char in {"_", "-"})
+    return REPORTS_DIR / f"{safe}.json"
+
+
+def _read_index() -> list[dict[str, Any]]:
+    if not INDEX_PATH.exists():
+        return []
+    return json.loads(INDEX_PATH.read_text(encoding="utf-8"))
+
+
+def _append_index(summary: dict[str, Any]) -> None:
+    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
+    records = [item for item in _read_index() if item.get("report_id") != summary["report_id"]]
+    records.insert(0, summary)
+    INDEX_PATH.write_text(json.dumps(records, indent=2, ensure_ascii=False), encoding="utf-8")
diff --git a/backend/app/services/review_service.py b/backend/app/services/review_service.py
index 387f6ee..b466264 100644
--- a/backend/app/services/review_service.py
+++ b/backend/app/services/review_service.py
@@ -1,7 +1,34 @@
-from argument_risk_engine.review.models import ReviewFeedback
-from argument_risk_engine.review.store import append_feedback
+from __future__ import annotations
+
+from argument_risk_engine.review.models import ReviewFeedback, ReviewItem
+from argument_risk_engine.review.store import (
+    append_feedback,
+    append_review_item,
+    read_review_items,
+    review_summary,
+)
 from backend.app.core.paths import REVIEW_STORE_PATH
+from backend.app.schemas.review import ReviewItemRequest
+
+
 
+def list_review_items() -> list[dict[str, object]]:
+    return [item.model_dump() for item in read_review_items(REVIEW_STORE_PATH)]
+
+
+def create_review_item(payload: ReviewItemRequest | ReviewItem) -> dict[str, object]:
+    if isinstance(payload, ReviewItem):
+        item = payload
+    else:
+        data = payload.model_dump()
+        if not data.get("review_id"):
+            data.pop("review_id", None)
+        item = ReviewItem(**data)
+    return append_review_item(REVIEW_STORE_PATH, item).model_dump()
+
+
+def get_review_summary() -> dict[str, object]:
+    return review_summary(REVIEW_STORE_PATH)
 
 
 def record_feedback(feedback: ReviewFeedback) -> dict[str, str]:
diff --git a/data/benchmarks/mini_eval_set.jsonl b/data/benchmarks/mini_eval_set.jsonl
index 93c1695..93f28e1 100644
--- a/data/benchmarks/mini_eval_set.jsonl
+++ b/data/benchmarks/mini_eval_set.jsonl
@@ -1 +1,2 @@
-{"text":"They are vermin.","expected":["dehumanizing_language"]}
+{"id":"sample_001","text":"They are vermin.","gold_labels":["dehumanizing_language"],"gold_evidence_spans":["vermin"],"difficulty":"easy","notes":"Simple dehumanizing-language smoke test for MVP evaluation."}
+{"id":"sample_002","text":"This proposal has tradeoffs and the evidence is mixed.","gold_labels":[],"gold_evidence_spans":[],"difficulty":"easy","notes":"Neutral reasoning sample used to monitor over-classification."}
diff --git a/engine/argument_risk_engine/evaluation/metrics.py b/engine/argument_risk_engine/evaluation/metrics.py
index e498aa4..b6acb14 100644
--- a/engine/argument_risk_engine/evaluation/metrics.py
+++ b/engine/argument_risk_engine/evaluation/metrics.py
@@ -1,2 +1,86 @@
+from __future__ import annotations
+
+from typing import Any
+
+
 def precision(tp: int, fp: int) -> float:
     return tp / (tp + fp) if tp + fp else 0.0
+
+
+def recall(tp: int, fn: int) -> float:
+    return tp / (tp + fn) if tp + fn else 0.0
+
+
+def f1_score(label_precision: float, label_recall: float) -> float:
+    return 2 * label_precision * label_recall / (label_precision + label_recall) if label_precision + label_recall else 0.0
+
+
+def partial_span_match(predicted: str, gold: str) -> bool:
+    predicted_norm = predicted.strip().lower()
+    gold_norm = gold.strip().lower()
+    return bool(predicted_norm and gold_norm and (predicted_norm in gold_norm or gold_norm in predicted_norm))
+
+
+def compute_metrics(rows: list[dict[str, Any]], analyses: list[dict[str, Any]]) -> dict[str, float]:
+    tp = fp = fn = 0
+    exact_matches = partial_matches = span_cases = 0
+    review_count = over_classified = no_finding = 0
+
+    for row, analysis in zip(rows, analyses, strict=False):
+        gold_labels = set(_gold_labels(row))
+        predicted_labels = set(_predicted_labels(analysis))
+        tp += len(predicted_labels & gold_labels)
+        fp += len(predicted_labels - gold_labels)
+        fn += len(gold_labels - predicted_labels)
+
+        if analysis.get("needs_human_review"):
+            review_count += 1
+        if len(predicted_labels) > len(gold_labels):
+            over_classified += 1
+        if not predicted_labels:
+            no_finding += 1
+
+        gold_spans = _gold_spans(row)
+        predicted_spans = _predicted_spans(analysis)
+        if gold_spans:
+            span_cases += 1
+            if any(pred == gold for pred in predicted_spans for gold in gold_spans):
+                exact_matches += 1
+            if any(partial_span_match(pred, gold) for pred in predicted_spans for gold in gold_spans):
+                partial_matches += 1
+
+    label_precision = precision(tp, fp)
+    label_recall = recall(tp, fn)
+    total = len(rows)
+    return {
+        "label_precision": round(label_precision, 4),
+        "label_recall": round(label_recall, 4),
+        "label_f1": round(f1_score(label_precision, label_recall), 4),
+        # NOTE: fp / (fp + tp) is the false discovery rate (1 - precision); a true
+        # false-positive rate would need true-negative counts, which are not tracked.
+        "false_positive_rate": round(fp / (fp + tp) if fp + tp else 0.0, 4),
+        "evidence_span_exact_match": round(exact_matches / span_cases if span_cases else 0.0, 4),
+        "evidence_span_partial_match": round(partial_matches / span_cases if span_cases else 0.0, 4),
+        "human_review_rate": round(review_count / total if total else 0.0, 4),
+        "over_classification_rate": round(over_classified / total if total else 0.0, 4),
+        "no_finding_rate": round(no_finding / total if total else 0.0, 4),
+    }
+
+
+def _gold_labels(row: dict[str, Any]) -> list[str]:
+    return [str(label) for label in row.get("gold_labels", row.get("expected", []))]
+
+
+def _gold_spans(row: dict[str, Any]) -> list[str]:
+    return [str(span) for span in row.get("gold_evidence_spans", [])]
+
+
+def _predicted_labels(analysis: dict[str, Any]) -> list[str]:
+    labels: list[str] = []
+    for risk in analysis.get("risks", []):
+        labels.append(str(risk.get("risk_id") or risk.get("label") or ""))
+    return [label for label in labels if label]
+
+
+def _predicted_spans(analysis: dict[str, Any]) -> list[str]:
+    return [str(risk.get("evidence_span") or "") for risk in analysis.get("risks", []) if risk.get("evidence_span")]
diff --git a/engine/argument_risk_engine/evaluation/runner.py b/engine/argument_risk_engine/evaluation/runner.py
index 84290e4..43dccb7 100644
--- 
a/engine/argument_risk_engine/evaluation/runner.py
+++ b/engine/argument_risk_engine/evaluation/runner.py
@@ -2,14 +2,66 @@
 
 import json
 from pathlib import Path
+from typing import Any
 
 from argument_risk_engine.analyzer import analyze_text
+from argument_risk_engine.evaluation.metrics import compute_metrics
 
+DISCLAIMER = (
+    "MVP evaluation metrics are operational QA indicators for this benchmark only; "
+    "they do not establish scientific or clinical validation."
+)
 
-def run_evaluation(path: Path | str) -> dict[str, object]:
-    rows = []
-    p = Path(path)
-    if p.exists():
-        rows = [json.loads(line) for line in p.read_text().splitlines() if line.strip()]
+
+def run_evaluation(path: Path | str) -> dict[str, Any]:
+    rows = load_benchmark(path)
     analyses = [analyze_text(row.get("text", "")) for row in rows]
-    return {"items": len(rows), "analyses": analyses}
+    metrics = compute_metrics(rows, analyses)
+    errors = collect_errors(rows, analyses)
+    return {
+        "items": len(rows),
+        "metrics": metrics,
+        "errors": errors,
+        "false_positives": errors["false_positives"],
+        "false_negatives": errors["false_negatives"],
+        "evidence_span_misses": errors["evidence_span_misses"],
+        "analyses": analyses,
+        "disclaimer": DISCLAIMER,
+    }
+
+
+def load_benchmark(path: Path | str) -> list[dict[str, Any]]:
+    p = Path(path)
+    if not p.exists():
+        return []
+    return [json.loads(line) for line in p.read_text(encoding="utf-8").splitlines() if line.strip()]
+
+
+def collect_errors(rows: list[dict[str, Any]], analyses: list[dict[str, Any]]) -> dict[str, list[dict[str, Any]]]:
+    """Collect per-row label and evidence-span errors for the evaluation report.
+
+    Risks without a usable label or evidence span are ignored, so empty
+    strings are never reported as false positives or as predicted spans.
+    """
+    false_positives: list[dict[str, Any]] = []
+    false_negatives: list[dict[str, Any]] = []
+    evidence_span_misses: list[dict[str, Any]] = []
+    for row, analysis in zip(rows, analyses, strict=False):
+        row_id = str(row.get("id") or row.get("text_id") or "unknown")
+        risks = [risk for risk in analysis.get("risks", []) if risk]
+        gold_labels = {str(label) for label in row.get("gold_labels", row.get("expected", []))}
+        predicted_labels = {str(risk.get("risk_id") or risk.get("label") or "") for risk in risks} - {""}
+        for label in sorted(predicted_labels - gold_labels):
+            false_positives.append({"id": row_id, "label": label, "text": row.get("text", "")})
+        for label in sorted(gold_labels - predicted_labels):
+            false_negatives.append({"id": row_id, "label": label, "text": row.get("text", "")})
+        gold_spans = [str(span) for span in row.get("gold_evidence_spans", [])]
+        if gold_spans:
+            predicted_spans = [str(risk["evidence_span"]) for risk in risks if risk.get("evidence_span")]
+            if not any(pred == gold for pred in predicted_spans for gold in gold_spans):
+                evidence_span_misses.append({"id": row_id, "gold_spans": gold_spans, "predicted_spans": predicted_spans})
+    return {
+        "false_positives": false_positives,
+        "false_negatives": false_negatives,
+        "evidence_span_misses": evidence_span_misses,
+    }
diff --git a/engine/argument_risk_engine/reports/html.py b/engine/argument_risk_engine/reports/html.py
index f9b1764..c64906d 100644
--- a/engine/argument_risk_engine/reports/html.py
+++ b/engine/argument_risk_engine/reports/html.py
@@ -1,2 +1,45 @@
-def render_html_report(result):
-    return "
No risk findings were detected by the current analysis configuration.
" + return f""" + +Analysis ID: {escape(str(result.get('analysis_id') or result.get('text_id', 'unknown')))}
+Overall risk score: {escape(str(result.get('overall_risk_score', 0)))}
+Risk level: {escape(str(result.get('risk_level', 'unknown')))}
+Metrics and reports are review aids only and do not claim scientific validation.
+{escape(str(risk.get('risk_id', '')))}{escape(str(risk.get('explanation', '')))}
+