From 0b3eaac0903039208f1234fb7d429fe5d9f61004 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Tue, 26 May 2026 17:36:04 +0800 Subject: [PATCH 1/7] update eval scripts: add ONNX size tracking and output sanitization - Add _compute_onnx_size() to measure combined ONNX + .data file sizes - Add _sanitize_output() to strip CLI chrome (Rich tables, banners) from eval_result.json, keeping only error-relevant content - Pass onnx_size_bytes and sanitize_fn through to build_eval_result() - Minor formatting fixes in reporter.py --- scripts/e2e_eval/run_eval.py | 70 +++++++++++++++++++++++++++++- scripts/e2e_eval/utils/reporter.py | 22 +++++++--- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index ae50351d1..f27165d98 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -233,6 +233,66 @@ def _utc_now() -> str: return datetime.now(timezone.utc).isoformat() +def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None: + """Return combined size in bytes of all ONNX files + their .data companions. + + Returns None if onnx_paths is empty or no files exist on disk. + """ + if not onnx_paths: + return None + total = 0 + found_any = False + for path_str in onnx_paths.values(): + p = Path(path_str) + if p.exists(): + total += p.stat().st_size + found_any = True + data_p = Path(path_str + ".data") + if data_p.exists(): + total += data_p.stat().st_size + return total if found_any else None + + +# Lines that carry no diagnostic value in eval_result.json. +# Matching is case-insensitive, applied per-line. +_NOISE_PATTERNS = ( + "benchmarking onnx", + "device:", + "task:", + "latency (ms)", + "throughput:", + "results saved to", + "inputs:", + "outputs:", + "samples/sec", +) + +# Box-drawing characters used by Rich tables. +_BOX_CHARS = frozenset("─│┌┐└┘├┤┬┴┼") + + +def _sanitize_output(text: str) -> str: + """Strip routine CLI chrome from subprocess output, keeping error content. + + Removes Rich benchmark tables, device/IO banners, and path lines that + bloat eval_result.json without aiding failure diagnosis. All classifier + patterns (see classifier.py) are error-related and survive this filter. + """ + kept: list[str] = [] + for line in text.splitlines(): + stripped = line.strip() + if not stripped: + continue + # Drop box-drawing table rows + if stripped[0] in _BOX_CHARS: + continue + low = stripped.lower() + if any(pat in low for pat in _NOISE_PATTERNS): + continue + kept.append(stripped) + return "\n".join(kept) + + def _kill_process_tree(pid: int) -> None: """Kill a process and all its children. @@ -1395,6 +1455,7 @@ def main() -> None: ep=args.ep, ) onnx_paths = build_result["onnx_paths"] if build_result["success"] else {} + onnx_size = _compute_onnx_size(onnx_paths) if not build_result["success"]: # Build failed — synthesize failed result for downstream phases @@ -1439,7 +1500,14 @@ def main() -> None: break result = build_eval_result( - entry, perf_proc, args.device, eval_types_run, accuracy_result, ep=args.ep + entry, + perf_proc, + args.device, + eval_types_run, + accuracy_result, + ep=args.ep, + onnx_size_bytes=onnx_size, + sanitize_fn=_sanitize_output, ) results.append(result) diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py index a97fef69b..0a16a51fe 100644 --- a/scripts/e2e_eval/utils/reporter.py +++ b/scripts/e2e_eval/utils/reporter.py @@ -23,6 +23,8 @@ if TYPE_CHECKING: + from collections.abc import Callable + from .registry import ModelEntry @@ -38,6 +40,8 @@ def build_eval_result( eval_types_run: list[str], accuracy_result: dict | None = None, ep: str | None = None, + onnx_size_bytes: int | None = None, + sanitize_fn: Callable[[str], str] | None = None, ) -> dict: """Build a unified eval_result dict (facts only, no derived fields). @@ -46,16 +50,23 @@ def build_eval_result( accuracy_result is the accuracy sub-section dict (or None if not run). ep is the explicit execution provider (e.g., "qnn", "dml"), or None when not specified (device-to-provider mapping was used). + onnx_size_bytes is the combined size of the exported ONNX + .data files. + sanitize_fn, when provided, is applied to stdout/stderr to remove noise. """ perf_section: dict | None = None if perf_proc is not None: passed = perf_proc["exit_code"] == 0 + stdout = perf_proc["stdout"] + stderr = perf_proc["stderr"] + if sanitize_fn is not None: + stdout = sanitize_fn(stdout) + stderr = sanitize_fn(stderr) perf_section = { "passed": passed, "elapsed": perf_proc["elapsed"], "exit_code": perf_proc["exit_code"], - "stdout_output": perf_proc["stdout"], - "stderr_output": perf_proc["stderr"], + "stdout_output": stdout, + "stderr_output": stderr, "timeout": perf_proc["timeout"], "command": perf_proc["command"], "error": perf_proc.get("error_summary", ""), @@ -76,6 +87,8 @@ def build_eval_result( "accuracy": accuracy_result, } # Optional fields: only include when explicitly provided by the user. + if onnx_size_bytes is not None: + result["onnx_size_bytes"] = onnx_size_bytes if ep is not None: result["ep"] = ep return result @@ -324,6 +337,7 @@ def generate_html_report( registry_path: Path | None = None, ) -> None: from .accuracy import format_delta + """Generate interactive HTML report with Perf and Accuracy tabs.""" results = report_data.get("results", []) @@ -366,9 +380,7 @@ def generate_html_report( if acc is not None else None ), - "delta_display": ( - format_delta(acc) if acc and not acc.get("skipped") else "" - ), + "delta_display": (format_delta(acc) if acc and not acc.get("skipped") else ""), "metric": ( { "name": (acc.get("winml_metric") or {}).get("metric"), From 9c152a0d957664a18b4ec3fa3edfa902b5aaf2de Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Tue, 26 May 2026 17:40:21 +0800 Subject: [PATCH 2/7] add winml sys output to environment.json Capture hardware details (devices, EPs, backends) by running `winml sys --format json` and embedding the result under the `winml_sys` key in environment.json. --- scripts/e2e_eval/run_eval.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index f27165d98..599140d3e 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -1084,6 +1084,20 @@ def save_environment_info(path: Path) -> None: except (subprocess.TimeoutExpired, FileNotFoundError): pass # git not available or timed out; commit info stays empty + # `winml sys --format json` captures hardware details (devices, EPs, + # backends) that the lightweight package-version probes above miss. + try: + result = subprocess.run( # noqa: S603 + [sys.executable, "-m", "winml", "sys", "--format", "json"], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode == 0: + info["winml_sys"] = json.loads(result.stdout) + except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError): + pass + path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(info, indent=2), encoding="utf-8") From 7d00b37f09a44933ca618767b51627fcf8971315 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Fri, 29 May 2026 16:32:03 +0800 Subject: [PATCH 3/7] fix: preserve raw perf data before output sanitization The sanitize_fn strips perf metric lines (latency, throughput, etc.) from stdout/stderr. Store the original output in raw_stdout/raw_stderr fields so downstream consumers can still access the full perf data. --- scripts/e2e_eval/utils/reporter.py | 13 +++++++---- tests/unit/eval/test_eval.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py index 0a16a51fe..2cd95367d 100644 --- a/scripts/e2e_eval/utils/reporter.py +++ b/scripts/e2e_eval/utils/reporter.py @@ -56,17 +56,22 @@ def build_eval_result( perf_section: dict | None = None if perf_proc is not None: passed = perf_proc["exit_code"] == 0 - stdout = perf_proc["stdout"] - stderr = perf_proc["stderr"] + raw_stdout = perf_proc["stdout"] + raw_stderr = perf_proc["stderr"] if sanitize_fn is not None: - stdout = sanitize_fn(stdout) - stderr = sanitize_fn(stderr) + stdout = sanitize_fn(raw_stdout) + stderr = sanitize_fn(raw_stderr) + else: + stdout = raw_stdout + stderr = raw_stderr perf_section = { "passed": passed, "elapsed": perf_proc["elapsed"], "exit_code": perf_proc["exit_code"], "stdout_output": stdout, "stderr_output": stderr, + "raw_stdout": raw_stdout, + "raw_stderr": raw_stderr, "timeout": perf_proc["timeout"], "command": perf_proc["command"], "error": perf_proc.get("error_summary", ""), diff --git a/tests/unit/eval/test_eval.py b/tests/unit/eval/test_eval.py index c7ca4af6d..427c6603e 100644 --- a/tests/unit/eval/test_eval.py +++ b/tests/unit/eval/test_eval.py @@ -1067,6 +1067,43 @@ def test_ep_present_when_provided(self): ) assert result["ep"] == "qnn" + def test_sanitize_fn_preserves_raw_perf_output(self): + reporter = self._load_reporter() + + perf_proc = { + "exit_code": 0, + "stdout": "Latency (ms): 12.5\nThroughput: 80 samples/sec\nsome error line", + "stderr": "warning: device busy", + "elapsed": 5.0, + "timeout": False, + "command": "winml perf", + "timestamp": "2026-01-01T00:00:00+00:00", + } + + def strip_perf(text: str) -> str: + return "\n".join( + l for l in text.splitlines() if "latency" not in l.lower() and "throughput" not in l.lower() + ) + + result = reporter.build_eval_result( + entry=self._make_entry(), + perf_proc=perf_proc, + device="cpu", + eval_types_run=["perf"], + accuracy_result=None, + ep=None, + sanitize_fn=strip_perf, + ) + + perf = result["perf"] + # sanitized output should not contain latency/throughput lines + assert "Latency" not in perf["stdout_output"] + assert "Throughput" not in perf["stdout_output"] + # raw output preserves the original perf data + assert "Latency (ms): 12.5" in perf["raw_stdout"] + assert "Throughput: 80 samples/sec" in perf["raw_stdout"] + assert perf["raw_stderr"] == "warning: device busy" + class TestDefaultDatasetImmutability: """Tests that module-level _DEFAULT_DATASETS are not corrupted.""" From f21a12366e0891e1913dd15e6504fe1c03189dff Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Fri, 29 May 2026 16:33:44 +0800 Subject: [PATCH 4/7] fix: add explanatory comment to except clause (CodeQL) --- scripts/e2e_eval/run_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 599140d3e..555573977 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -1096,7 +1096,7 @@ def save_environment_info(path: Path) -> None: if result.returncode == 0: info["winml_sys"] = json.loads(result.stdout) except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError): - pass + pass # winml sys unavailable or returned invalid JSON; skip optional field path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(info, indent=2), encoding="utf-8") From b3e0cf9614c8ce504ccf82f978b9954aa9768c0d Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Tue, 2 Jun 2026 20:41:24 +0800 Subject: [PATCH 5/7] fix: address PR review comments - Fix displaced docstring in generate_html_report (was after import) - Anchor _sanitize_output patterns to line start to avoid stripping legitimate error messages containing pattern substrings - Use idiomatic pathlib for .data companion file construction --- scripts/e2e_eval/run_eval.py | 6 +++--- scripts/e2e_eval/utils/reporter.py | 2 +- tests/unit/eval/test_eval.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 555573977..c1a7a206f 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -247,7 +247,7 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None: if p.exists(): total += p.stat().st_size found_any = True - data_p = Path(path_str + ".data") + data_p = p.with_suffix(p.suffix + ".data") if data_p.exists(): total += data_p.stat().st_size return total if found_any else None @@ -287,7 +287,7 @@ def _sanitize_output(text: str) -> str: if stripped[0] in _BOX_CHARS: continue low = stripped.lower() - if any(pat in low for pat in _NOISE_PATTERNS): + if any(low.startswith(pat) for pat in _NOISE_PATTERNS): continue kept.append(stripped) return "\n".join(kept) @@ -1087,7 +1087,7 @@ def save_environment_info(path: Path) -> None: # `winml sys --format json` captures hardware details (devices, EPs, # backends) that the lightweight package-version probes above miss. try: - result = subprocess.run( # noqa: S603 + result = subprocess.run( [sys.executable, "-m", "winml", "sys", "--format", "json"], capture_output=True, text=True, diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py index 2cd95367d..5da4f75ad 100644 --- a/scripts/e2e_eval/utils/reporter.py +++ b/scripts/e2e_eval/utils/reporter.py @@ -341,9 +341,9 @@ def generate_html_report( output_path: Path, registry_path: Path | None = None, ) -> None: + """Generate interactive HTML report with Perf and Accuracy tabs.""" from .accuracy import format_delta - """Generate interactive HTML report with Perf and Accuracy tabs.""" results = report_data.get("results", []) # Load registry for enrichment diff --git a/tests/unit/eval/test_eval.py b/tests/unit/eval/test_eval.py index 427c6603e..e6c53021d 100644 --- a/tests/unit/eval/test_eval.py +++ b/tests/unit/eval/test_eval.py @@ -91,9 +91,7 @@ def test_feature_extraction_mapped_to_hf_image_feature_extraction_for_vision_mod fake_onnx_config = MagicMock() fake_onnx_config.inputs = {"pixel_values": object()} - config = WinMLEvaluationConfig( - model_id="facebook/dinov2-base", task="feature-extraction" - ) + config = WinMLEvaluationConfig(model_id="facebook/dinov2-base", task="feature-extraction") with ( patch( "transformers.AutoConfig.from_pretrained", @@ -1082,7 +1080,9 @@ def test_sanitize_fn_preserves_raw_perf_output(self): def strip_perf(text: str) -> str: return "\n".join( - l for l in text.splitlines() if "latency" not in l.lower() and "throughput" not in l.lower() + line + for line in text.splitlines() + if "latency" not in line.lower() and "throughput" not in line.lower() ) result = reporter.build_eval_result( From 9154eff238fbc98c4eb4f18b640aff4c2e88e314 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Thu, 4 Jun 2026 10:34:00 +0800 Subject: [PATCH 6/7] fix: address PR review comments - Improve _compute_onnx_size to parse ONNX proto for all external data files instead of only checking the conventional .data suffix - Add debug logging when winml sys times out or fails (replaces bare pass) - Add --raw-output flag to skip output sanitization in eval_result.json --- scripts/e2e_eval/run_eval.py | 46 +++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index cf790f6b2..840c5b562 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -31,6 +31,7 @@ import argparse import contextlib import json +import logging import os import platform import shutil @@ -69,6 +70,8 @@ ) +logger = logging.getLogger(__name__) + # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -234,7 +237,11 @@ def _utc_now() -> str: def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None: - """Return combined size in bytes of all ONNX files + their .data companions. + """Return combined size in bytes of all ONNX files + their external data companions. + + Parses the ONNX proto to discover all referenced external data files (not just + the conventional `.data` suffix). Falls back to the `.data` companion heuristic + if proto parsing is unavailable. Returns None if onnx_paths is empty or no files exist on disk. """ @@ -244,12 +251,24 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None: found_any = False for path_str in onnx_paths.values(): p = Path(path_str) - if p.exists(): - total += p.stat().st_size - found_any = True - data_p = p.with_suffix(p.suffix + ".data") - if data_p.exists(): - total += data_p.stat().st_size + if not p.exists(): + continue + total += p.stat().st_size + found_any = True + # Try to enumerate all external data files from the proto + try: + from winml.modelkit.onnx.external_data import get_external_data_files + + ext_files = get_external_data_files(p) + for ext_name in ext_files: + ext_path = p.parent / ext_name + if ext_path.exists(): + total += ext_path.stat().st_size + except Exception: + # Fallback: check conventional .data companion + data_p = p.with_suffix(p.suffix + ".data") + if data_p.exists(): + total += data_p.stat().st_size return total if found_any else None @@ -1091,7 +1110,7 @@ def save_environment_info(path: Path) -> None: # `winml sys --format json` captures hardware details (devices, EPs, # backends) that the lightweight package-version probes above miss. try: - result = subprocess.run( + result = subprocess.run( # noqa: S603 [sys.executable, "-m", "winml", "sys", "--format", "json"], capture_output=True, text=True, @@ -1099,8 +1118,8 @@ def save_environment_info(path: Path) -> None: ) if result.returncode == 0: info["winml_sys"] = json.loads(result.stdout) - except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError): - pass # winml sys unavailable or returned invalid JSON; skip optional field + except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc: + logger.debug("winml sys skipped: %s", exc) path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(info, indent=2), encoding="utf-8") @@ -1213,6 +1232,11 @@ def parse_args() -> argparse.Namespace: help="Skip report generation (useful when running per-model in a pipeline loop)", ) parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") + parser.add_argument( + "--raw-output", + action="store_true", + help="Keep raw subprocess output in eval_result.json without sanitization", + ) parser.add_argument( "--continue", dest="continue_run", @@ -1525,7 +1549,7 @@ def main() -> None: accuracy_result, ep=args.ep, onnx_size_bytes=onnx_size, - sanitize_fn=_sanitize_output, + sanitize_fn=None if args.raw_output else _sanitize_output, ) results.append(result) From 2652af177c0f74f1d88027bcf252a9750e1b2876 Mon Sep 17 00:00:00 2001 From: Qiong Wu Date: Thu, 4 Jun 2026 10:36:06 +0800 Subject: [PATCH 7/7] perf: compile noise patterns into a single regex Replaces per-line linear scan with a pre-compiled regex for better performance on large outputs. --- scripts/e2e_eval/run_eval.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py index 840c5b562..365357c01 100644 --- a/scripts/e2e_eval/run_eval.py +++ b/scripts/e2e_eval/run_eval.py @@ -34,6 +34,7 @@ import logging import os import platform +import re import shutil import subprocess import sys @@ -273,7 +274,7 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None: # Lines that carry no diagnostic value in eval_result.json. -# Matching is case-insensitive, applied per-line. +# Matching is case-insensitive, anchored at line start. _NOISE_PATTERNS = ( "benchmarking onnx", "device:", @@ -285,6 +286,7 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None: "outputs:", "samples/sec", ) +_NOISE_RE = re.compile("|".join(re.escape(p) for p in _NOISE_PATTERNS), re.IGNORECASE) # Box-drawing characters used by Rich tables. _BOX_CHARS = frozenset("─│┌┐└┘├┤┬┴┼") @@ -305,8 +307,7 @@ def _sanitize_output(text: str) -> str: # Drop box-drawing table rows if stripped[0] in _BOX_CHARS: continue - low = stripped.lower() - if any(low.startswith(pat) for pat in _NOISE_PATTERNS): + if _NOISE_RE.match(stripped): continue kept.append(stripped) return "\n".join(kept)