def _existing_file_size(path: Path) -> tuple[Path, int] | None:
    """Return ``(resolved_path, size_in_bytes)`` for a regular file, else None.

    Directories, missing paths and paths that cannot be resolved or stat'ed
    all yield None so callers can treat them uniformly as "nothing to count".
    """
    try:
        resolved = path.resolve()
        if not resolved.is_file():
            return None
        return resolved, resolved.stat().st_size
    except (OSError, RuntimeError):
        # RuntimeError: symlink loops on older Python versions.
        return None


def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
    """Return combined size in bytes of all ONNX files + their external data companions.

    For every model file that exists on disk, the external data files it
    references are discovered by parsing the ONNX proto (so names other than
    the conventional ``.data`` suffix are found). If the proto helper cannot be
    imported, or fails for a given model, the ``<model>.data`` companion
    heuristic is used for that model instead.

    Each file on disk is counted once, even when it is reachable through
    several entries (the same model listed under two keys, or one weights file
    shared by several graphs).

    Args:
        onnx_paths: Mapping of component name to ONNX file path.

    Returns:
        Total size in bytes, or None if ``onnx_paths`` is empty or none of the
        listed model files exist on disk.
    """
    if not onnx_paths:
        return None

    # Resolve the optional proto-based enumerator once, not once per model.
    # Size reporting is best-effort, so any import-time failure simply
    # selects the ``.data`` heuristic below.
    try:
        from winml.modelkit.onnx.external_data import get_external_data_files
    except Exception:  # noqa: BLE001
        get_external_data_files = None

    # resolved path -> size; keying by resolved path de-duplicates shared files.
    sizes: dict[Path, int] = {}

    for path_str in onnx_paths.values():
        model_path = Path(path_str)
        model_entry = _existing_file_size(model_path)
        if model_entry is None:
            continue
        sizes[model_entry[0]] = model_entry[1]

        # Build the full companion list *before* touching the totals, so a
        # failure halfway through enumeration cannot leave a partial sum that
        # the fallback would then add to.
        companions: list[Path] | None = None
        if get_external_data_files is not None:
            try:
                companions = [
                    model_path.parent / ext_name
                    for ext_name in get_external_data_files(model_path)
                ]
            except Exception:  # noqa: BLE001 - fall back to the heuristic
                companions = None
        if companions is None:
            companions = [model_path.with_suffix(model_path.suffix + ".data")]

        for companion in companions:
            companion_entry = _existing_file_size(companion)
            if companion_entry is not None:
                sizes[companion_entry[0]] = companion_entry[1]

    return sum(sizes.values()) if sizes else None


# Lines that carry no diagnostic value in eval_result.json.
# Matching is case-insensitive, anchored at line start.
_NOISE_PATTERNS = (
    "benchmarking onnx",
    "device:",
    "task:",
    "latency (ms)",
    "throughput:",
    "results saved to",
    "inputs:",
    "outputs:",
    "samples/sec",
)
# Compiled once and used with ``.match()``, so a pattern only hits when it
# appears at the very start of a (stripped) line, ignoring case.
_NOISE_RE = re.compile("|".join(re.escape(p) for p in _NOISE_PATTERNS), re.IGNORECASE)

# Box-drawing characters used by Rich tables.
# Light glyphs cover table bodies; the heavy and mixed glyphs cover the header
# rows of Rich's default HEAVY_HEAD style (┏━┳┓ / ┃ / ┡━╇┩). Rounded corners
# (╭ ╮ ╰ ╯) are left out on purpose so Rich traceback panel headers are kept.
_BOX_CHARS = frozenset(
    "─│┌┐└┘├┤┬┴┼"  # light
    "━┃┏┓┗┛┣┫┳┻╋"  # heavy
    "┡┩╇┢┪╈"  # heavy/light transitions
)


def _sanitize_output(text: str) -> str:
    """Strip routine CLI chrome from subprocess output, keeping error content.

    A line is dropped when, after stripping surrounding whitespace, it is
    empty, starts with a table box-drawing character, or starts with one of
    the ``_NOISE_PATTERNS`` prefixes (case-insensitive). Every other line is
    kept, stripped, in its original order.

    Args:
        text: Raw stdout or stderr captured from a subprocess.

    Returns:
        The surviving lines joined with newlines; an empty string when nothing
        survives.
    """
    kept: list[str] = []
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        # Drop box-drawing table rows (borders, header and body rows alike).
        if stripped[0] in _BOX_CHARS:
            continue
        # Drop banner lines; anchored at line start so an error message that
        # merely mentions e.g. "device:" further along is still kept.
        if _NOISE_RE.match(stripped):
            continue
        kept.append(stripped)
    return "\n".join(kept)
+ try: + result = subprocess.run( # noqa: S603 + [sys.executable, "-m", "winml", "sys", "--format", "json"], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode == 0: + info["winml_sys"] = json.loads(result.stdout) + except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc: + logger.debug("winml sys skipped: %s", exc) + path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(info, indent=2), encoding="utf-8") @@ -1139,6 +1233,11 @@ def parse_args() -> argparse.Namespace: help="Skip report generation (useful when running per-model in a pipeline loop)", ) parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") + parser.add_argument( + "--raw-output", + action="store_true", + help="Keep raw subprocess output in eval_result.json without sanitization", + ) parser.add_argument( "--continue", dest="continue_run", @@ -1399,6 +1498,7 @@ def main() -> None: ep=args.ep, ) onnx_paths = build_result["onnx_paths"] if build_result["success"] else {} + onnx_size = _compute_onnx_size(onnx_paths) if not build_result["success"]: # Build failed — synthesize failed result for downstream phases @@ -1443,7 +1543,14 @@ def main() -> None: break result = build_eval_result( - entry, perf_proc, args.device, eval_types_run, accuracy_result, ep=args.ep + entry, + perf_proc, + args.device, + eval_types_run, + accuracy_result, + ep=args.ep, + onnx_size_bytes=onnx_size, + sanitize_fn=None if args.raw_output else _sanitize_output, ) results.append(result) diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py index a97fef69b..5da4f75ad 100644 --- a/scripts/e2e_eval/utils/reporter.py +++ b/scripts/e2e_eval/utils/reporter.py @@ -23,6 +23,8 @@ if TYPE_CHECKING: + from collections.abc import Callable + from .registry import ModelEntry @@ -38,6 +40,8 @@ def build_eval_result( eval_types_run: list[str], accuracy_result: dict | None = None, ep: str | None = None, + 
onnx_size_bytes: int | None = None, + sanitize_fn: Callable[[str], str] | None = None, ) -> dict: """Build a unified eval_result dict (facts only, no derived fields). @@ -46,16 +50,28 @@ def build_eval_result( accuracy_result is the accuracy sub-section dict (or None if not run). ep is the explicit execution provider (e.g., "qnn", "dml"), or None when not specified (device-to-provider mapping was used). + onnx_size_bytes is the combined size of the exported ONNX + .data files. + sanitize_fn, when provided, is applied to stdout/stderr to remove noise. """ perf_section: dict | None = None if perf_proc is not None: passed = perf_proc["exit_code"] == 0 + raw_stdout = perf_proc["stdout"] + raw_stderr = perf_proc["stderr"] + if sanitize_fn is not None: + stdout = sanitize_fn(raw_stdout) + stderr = sanitize_fn(raw_stderr) + else: + stdout = raw_stdout + stderr = raw_stderr perf_section = { "passed": passed, "elapsed": perf_proc["elapsed"], "exit_code": perf_proc["exit_code"], - "stdout_output": perf_proc["stdout"], - "stderr_output": perf_proc["stderr"], + "stdout_output": stdout, + "stderr_output": stderr, + "raw_stdout": raw_stdout, + "raw_stderr": raw_stderr, "timeout": perf_proc["timeout"], "command": perf_proc["command"], "error": perf_proc.get("error_summary", ""), @@ -76,6 +92,8 @@ def build_eval_result( "accuracy": accuracy_result, } # Optional fields: only include when explicitly provided by the user. 
+ if onnx_size_bytes is not None: + result["onnx_size_bytes"] = onnx_size_bytes if ep is not None: result["ep"] = ep return result @@ -323,8 +341,9 @@ def generate_html_report( output_path: Path, registry_path: Path | None = None, ) -> None: - from .accuracy import format_delta """Generate interactive HTML report with Perf and Accuracy tabs.""" + from .accuracy import format_delta + results = report_data.get("results", []) # Load registry for enrichment @@ -366,9 +385,7 @@ def generate_html_report( if acc is not None else None ), - "delta_display": ( - format_delta(acc) if acc and not acc.get("skipped") else "" - ), + "delta_display": (format_delta(acc) if acc and not acc.get("skipped") else ""), "metric": ( { "name": (acc.get("winml_metric") or {}).get("metric"), diff --git a/tests/unit/eval/test_eval.py b/tests/unit/eval/test_eval.py index 568df6dbb..86b5426ff 100644 --- a/tests/unit/eval/test_eval.py +++ b/tests/unit/eval/test_eval.py @@ -110,9 +110,7 @@ def test_feature_extraction_mapped_to_hf_image_feature_extraction_for_vision_mod fake_onnx_config = MagicMock() fake_onnx_config.inputs = {"pixel_values": object()} - config = WinMLEvaluationConfig( - model_id="facebook/dinov2-base", task="feature-extraction" - ) + config = WinMLEvaluationConfig(model_id="facebook/dinov2-base", task="feature-extraction") with ( patch( "transformers.AutoConfig.from_pretrained", @@ -1158,6 +1156,45 @@ def test_ep_present_when_provided(self): ) assert result["ep"] == "qnn" + def test_sanitize_fn_preserves_raw_perf_output(self): + reporter = self._load_reporter() + + perf_proc = { + "exit_code": 0, + "stdout": "Latency (ms): 12.5\nThroughput: 80 samples/sec\nsome error line", + "stderr": "warning: device busy", + "elapsed": 5.0, + "timeout": False, + "command": "winml perf", + "timestamp": "2026-01-01T00:00:00+00:00", + } + + def strip_perf(text: str) -> str: + return "\n".join( + line + for line in text.splitlines() + if "latency" not in line.lower() and "throughput" not in 
line.lower() + ) + + result = reporter.build_eval_result( + entry=self._make_entry(), + perf_proc=perf_proc, + device="cpu", + eval_types_run=["perf"], + accuracy_result=None, + ep=None, + sanitize_fn=strip_perf, + ) + + perf = result["perf"] + # sanitized output should not contain latency/throughput lines + assert "Latency" not in perf["stdout_output"] + assert "Throughput" not in perf["stdout_output"] + # raw output preserves the original perf data + assert "Latency (ms): 12.5" in perf["raw_stdout"] + assert "Throughput: 80 samples/sec" in perf["raw_stdout"] + assert perf["raw_stderr"] == "warning: device busy" + class TestDefaultDatasetImmutability: """Tests that module-level _DEFAULT_DATASETS are not corrupted."""