From 0b3eaac0903039208f1234fb7d429fe5d9f61004 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Tue, 26 May 2026 17:36:04 +0800
Subject: [PATCH 1/7] update eval scripts: add ONNX size tracking and output
 sanitization

- Add _compute_onnx_size() to measure combined ONNX + .data file sizes
- Add _sanitize_output() to strip CLI chrome (Rich tables, banners) from
  eval_result.json, keeping only error-relevant content
- Pass onnx_size_bytes and sanitize_fn through to build_eval_result()
- Minor formatting fixes in reporter.py
---
 scripts/e2e_eval/run_eval.py       | 70 +++++++++++++++++++++++++++++-
 scripts/e2e_eval/utils/reporter.py | 22 +++++++---
 2 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index ae50351d1..f27165d98 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -233,6 +233,66 @@ def _utc_now() -> str:
     return datetime.now(timezone.utc).isoformat()
 
 
+def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
+    """Return combined size in bytes of all ONNX files + their .data companions.
+
+    Returns None if onnx_paths is empty or no files exist on disk.
+    """
+    if not onnx_paths:
+        return None
+    total = 0
+    found_any = False
+    for path_str in onnx_paths.values():
+        p = Path(path_str)
+        if p.exists():
+            total += p.stat().st_size
+            found_any = True
+        data_p = Path(path_str + ".data")
+        if data_p.exists():
+            total += data_p.stat().st_size
+    return total if found_any else None
+
+
+# Lines that carry no diagnostic value in eval_result.json.
+# Matching is case-insensitive, applied per-line.
+_NOISE_PATTERNS = (
+    "benchmarking onnx",
+    "device:",
+    "task:",
+    "latency (ms)",
+    "throughput:",
+    "results saved to",
+    "inputs:",
+    "outputs:",
+    "samples/sec",
+)
+
+# Box-drawing characters used by Rich tables.
+_BOX_CHARS = frozenset("─│┌┐└┘├┤┬┴┼")
+
+
+def _sanitize_output(text: str) -> str:
+    """Strip routine CLI chrome from subprocess output, keeping error content.
+
+    Removes Rich benchmark tables, device/IO banners, and path lines that
+    bloat eval_result.json without aiding failure diagnosis. All classifier
+    patterns (see classifier.py) are error-related and survive this filter.
+    """
+    kept: list[str] = []
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        # Drop box-drawing table rows
+        if stripped[0] in _BOX_CHARS:
+            continue
+        low = stripped.lower()
+        if any(pat in low for pat in _NOISE_PATTERNS):
+            continue
+        kept.append(stripped)
+    return "\n".join(kept)
+
+
 def _kill_process_tree(pid: int) -> None:
     """Kill a process and all its children.
 
@@ -1395,6 +1455,7 @@ def main() -> None:
                 ep=args.ep,
             )
             onnx_paths = build_result["onnx_paths"] if build_result["success"] else {}
+            onnx_size = _compute_onnx_size(onnx_paths)
 
             if not build_result["success"]:
                 # Build failed — synthesize failed result for downstream phases
@@ -1439,7 +1500,14 @@ def main() -> None:
             break
 
         result = build_eval_result(
-            entry, perf_proc, args.device, eval_types_run, accuracy_result, ep=args.ep
+            entry,
+            perf_proc,
+            args.device,
+            eval_types_run,
+            accuracy_result,
+            ep=args.ep,
+            onnx_size_bytes=onnx_size,
+            sanitize_fn=_sanitize_output,
         )
         results.append(result)
 
diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py
index a97fef69b..0a16a51fe 100644
--- a/scripts/e2e_eval/utils/reporter.py
+++ b/scripts/e2e_eval/utils/reporter.py
@@ -23,6 +23,8 @@
 
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from .registry import ModelEntry
 
 
@@ -38,6 +40,8 @@ def build_eval_result(
     eval_types_run: list[str],
     accuracy_result: dict | None = None,
     ep: str | None = None,
+    onnx_size_bytes: int | None = None,
+    sanitize_fn: Callable[[str], str] | None = None,
 ) -> dict:
     """Build a unified eval_result dict (facts only, no derived fields).
 
@@ -46,16 +50,23 @@ def build_eval_result(
     accuracy_result is the accuracy sub-section dict (or None if not run).
     ep is the explicit execution provider (e.g., "qnn", "dml"), or None when
     not specified (device-to-provider mapping was used).
+    onnx_size_bytes is the combined size of the exported ONNX + .data files.
+    sanitize_fn, when provided, is applied to stdout/stderr to remove noise.
     """
     perf_section: dict | None = None
     if perf_proc is not None:
         passed = perf_proc["exit_code"] == 0
+        stdout = perf_proc["stdout"]
+        stderr = perf_proc["stderr"]
+        if sanitize_fn is not None:
+            stdout = sanitize_fn(stdout)
+            stderr = sanitize_fn(stderr)
         perf_section = {
             "passed": passed,
             "elapsed": perf_proc["elapsed"],
             "exit_code": perf_proc["exit_code"],
-            "stdout_output": perf_proc["stdout"],
-            "stderr_output": perf_proc["stderr"],
+            "stdout_output": stdout,
+            "stderr_output": stderr,
             "timeout": perf_proc["timeout"],
             "command": perf_proc["command"],
             "error": perf_proc.get("error_summary", ""),
@@ -76,6 +87,8 @@ def build_eval_result(
         "accuracy": accuracy_result,
     }
     # Optional fields: only include when explicitly provided by the user.
+    if onnx_size_bytes is not None:
+        result["onnx_size_bytes"] = onnx_size_bytes
     if ep is not None:
         result["ep"] = ep
     return result
@@ -324,6 +337,7 @@ def generate_html_report(
     registry_path: Path | None = None,
 ) -> None:
     from .accuracy import format_delta
+
     """Generate interactive HTML report with Perf and Accuracy tabs."""
     results = report_data.get("results", [])
 
@@ -366,9 +380,7 @@ def generate_html_report(
                     if acc is not None
                     else None
                 ),
-                "delta_display": (
-                    format_delta(acc) if acc and not acc.get("skipped") else ""
-                ),
+                "delta_display": (format_delta(acc) if acc and not acc.get("skipped") else ""),
                 "metric": (
                     {
                         "name": (acc.get("winml_metric") or {}).get("metric"),

From 9c152a0d957664a18b4ec3fa3edfa902b5aaf2de Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Tue, 26 May 2026 17:40:21 +0800
Subject: [PATCH 2/7] add winml sys output to environment.json

Capture hardware details (devices, EPs, backends) by running
`winml sys --format json` and embedding the result under the
`winml_sys` key in environment.json.
---
 scripts/e2e_eval/run_eval.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index f27165d98..599140d3e 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -1084,6 +1084,20 @@ def save_environment_info(path: Path) -> None:
     except (subprocess.TimeoutExpired, FileNotFoundError):
         pass  # git not available or timed out; commit info stays empty
 
+    # `winml sys --format json` captures hardware details (devices, EPs,
+    # backends) that the lightweight package-version probes above miss.
+    try:
+        result = subprocess.run(  # noqa: S603
+            [sys.executable, "-m", "winml", "sys", "--format", "json"],
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        if result.returncode == 0:
+            info["winml_sys"] = json.loads(result.stdout)
+    except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError):
+        pass
+
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(json.dumps(info, indent=2), encoding="utf-8")
 

From 7d00b37f09a44933ca618767b51627fcf8971315 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Fri, 29 May 2026 16:32:03 +0800
Subject: [PATCH 3/7] fix: preserve raw perf data before output sanitization

The sanitize_fn strips perf metric lines (latency, throughput, etc.)
from stdout/stderr. Store the original output in raw_stdout/raw_stderr
fields so downstream consumers can still access the full perf data.
---
 scripts/e2e_eval/utils/reporter.py | 13 +++++++----
 tests/unit/eval/test_eval.py       | 37 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py
index 0a16a51fe..2cd95367d 100644
--- a/scripts/e2e_eval/utils/reporter.py
+++ b/scripts/e2e_eval/utils/reporter.py
@@ -56,17 +56,22 @@ def build_eval_result(
     perf_section: dict | None = None
     if perf_proc is not None:
         passed = perf_proc["exit_code"] == 0
-        stdout = perf_proc["stdout"]
-        stderr = perf_proc["stderr"]
+        raw_stdout = perf_proc["stdout"]
+        raw_stderr = perf_proc["stderr"]
         if sanitize_fn is not None:
-            stdout = sanitize_fn(stdout)
-            stderr = sanitize_fn(stderr)
+            stdout = sanitize_fn(raw_stdout)
+            stderr = sanitize_fn(raw_stderr)
+        else:
+            stdout = raw_stdout
+            stderr = raw_stderr
         perf_section = {
             "passed": passed,
             "elapsed": perf_proc["elapsed"],
             "exit_code": perf_proc["exit_code"],
             "stdout_output": stdout,
             "stderr_output": stderr,
+            "raw_stdout": raw_stdout,
+            "raw_stderr": raw_stderr,
             "timeout": perf_proc["timeout"],
             "command": perf_proc["command"],
             "error": perf_proc.get("error_summary", ""),
diff --git a/tests/unit/eval/test_eval.py b/tests/unit/eval/test_eval.py
index c7ca4af6d..427c6603e 100644
--- a/tests/unit/eval/test_eval.py
+++ b/tests/unit/eval/test_eval.py
@@ -1067,6 +1067,43 @@ def test_ep_present_when_provided(self):
         )
         assert result["ep"] == "qnn"
 
+    def test_sanitize_fn_preserves_raw_perf_output(self):
+        reporter = self._load_reporter()
+
+        perf_proc = {
+            "exit_code": 0,
+            "stdout": "Latency (ms): 12.5\nThroughput: 80 samples/sec\nsome error line",
+            "stderr": "warning: device busy",
+            "elapsed": 5.0,
+            "timeout": False,
+            "command": "winml perf",
+            "timestamp": "2026-01-01T00:00:00+00:00",
+        }
+
+        def strip_perf(text: str) -> str:
+            return "\n".join(
+                l for l in text.splitlines() if "latency" not in l.lower() and "throughput" not in l.lower()
+            )
+
+        result = reporter.build_eval_result(
+            entry=self._make_entry(),
+            perf_proc=perf_proc,
+            device="cpu",
+            eval_types_run=["perf"],
+            accuracy_result=None,
+            ep=None,
+            sanitize_fn=strip_perf,
+        )
+
+        perf = result["perf"]
+        # sanitized output should not contain latency/throughput lines
+        assert "Latency" not in perf["stdout_output"]
+        assert "Throughput" not in perf["stdout_output"]
+        # raw output preserves the original perf data
+        assert "Latency (ms): 12.5" in perf["raw_stdout"]
+        assert "Throughput: 80 samples/sec" in perf["raw_stdout"]
+        assert perf["raw_stderr"] == "warning: device busy"
+
 
 class TestDefaultDatasetImmutability:
     """Tests that module-level _DEFAULT_DATASETS are not corrupted."""

From f21a12366e0891e1913dd15e6504fe1c03189dff Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Fri, 29 May 2026 16:33:44 +0800
Subject: [PATCH 4/7] fix: add explanatory comment to except clause (CodeQL)

---
 scripts/e2e_eval/run_eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 599140d3e..555573977 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -1096,7 +1096,7 @@ def save_environment_info(path: Path) -> None:
         if result.returncode == 0:
             info["winml_sys"] = json.loads(result.stdout)
     except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError):
-        pass
+        pass  # winml sys unavailable or returned invalid JSON; skip optional field
 
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(json.dumps(info, indent=2), encoding="utf-8")

From b3e0cf9614c8ce504ccf82f978b9954aa9768c0d Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Tue, 2 Jun 2026 20:41:24 +0800
Subject: [PATCH 5/7] fix: address PR review comments

- Fix displaced docstring in generate_html_report (was after import)
- Anchor _sanitize_output patterns to line start to avoid stripping
  legitimate error messages containing pattern substrings
- Use idiomatic pathlib for .data companion file construction
---
 scripts/e2e_eval/run_eval.py       | 6 +++---
 scripts/e2e_eval/utils/reporter.py | 2 +-
 tests/unit/eval/test_eval.py       | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 555573977..c1a7a206f 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -247,7 +247,7 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
         if p.exists():
             total += p.stat().st_size
             found_any = True
-        data_p = Path(path_str + ".data")
+        data_p = p.with_suffix(p.suffix + ".data")
         if data_p.exists():
             total += data_p.stat().st_size
     return total if found_any else None
@@ -287,7 +287,7 @@ def _sanitize_output(text: str) -> str:
         if stripped[0] in _BOX_CHARS:
             continue
         low = stripped.lower()
-        if any(pat in low for pat in _NOISE_PATTERNS):
+        if any(low.startswith(pat) for pat in _NOISE_PATTERNS):
             continue
         kept.append(stripped)
     return "\n".join(kept)
@@ -1087,7 +1087,7 @@ def save_environment_info(path: Path) -> None:
     # `winml sys --format json` captures hardware details (devices, EPs,
     # backends) that the lightweight package-version probes above miss.
     try:
-        result = subprocess.run(  # noqa: S603
+        result = subprocess.run(
             [sys.executable, "-m", "winml", "sys", "--format", "json"],
             capture_output=True,
             text=True,
diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py
index 2cd95367d..5da4f75ad 100644
--- a/scripts/e2e_eval/utils/reporter.py
+++ b/scripts/e2e_eval/utils/reporter.py
@@ -341,9 +341,9 @@ def generate_html_report(
     output_path: Path,
     registry_path: Path | None = None,
 ) -> None:
+    """Generate interactive HTML report with Perf and Accuracy tabs."""
     from .accuracy import format_delta
 
-    """Generate interactive HTML report with Perf and Accuracy tabs."""
     results = report_data.get("results", [])
 
     # Load registry for enrichment
diff --git a/tests/unit/eval/test_eval.py b/tests/unit/eval/test_eval.py
index 427c6603e..e6c53021d 100644
--- a/tests/unit/eval/test_eval.py
+++ b/tests/unit/eval/test_eval.py
@@ -91,9 +91,7 @@ def test_feature_extraction_mapped_to_hf_image_feature_extraction_for_vision_mod
         fake_onnx_config = MagicMock()
         fake_onnx_config.inputs = {"pixel_values": object()}
 
-        config = WinMLEvaluationConfig(
-            model_id="facebook/dinov2-base", task="feature-extraction"
-        )
+        config = WinMLEvaluationConfig(model_id="facebook/dinov2-base", task="feature-extraction")
         with (
             patch(
                 "transformers.AutoConfig.from_pretrained",
@@ -1082,7 +1080,9 @@ def test_sanitize_fn_preserves_raw_perf_output(self):
 
         def strip_perf(text: str) -> str:
             return "\n".join(
-                l for l in text.splitlines() if "latency" not in l.lower() and "throughput" not in l.lower()
+                line
+                for line in text.splitlines()
+                if "latency" not in line.lower() and "throughput" not in line.lower()
             )
 
         result = reporter.build_eval_result(

From 9154eff238fbc98c4eb4f18b640aff4c2e88e314 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Thu, 4 Jun 2026 10:34:00 +0800
Subject: [PATCH 6/7] fix: address PR review comments

- Improve _compute_onnx_size to parse ONNX proto for all external data
  files instead of only checking the conventional .data suffix
- Add debug logging when winml sys times out or fails (replaces bare pass)
- Add --raw-output flag to skip output sanitization in eval_result.json
---
 scripts/e2e_eval/run_eval.py | 46 +++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index cf790f6b2..840c5b562 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -31,6 +31,7 @@
 import argparse
 import contextlib
 import json
+import logging
 import os
 import platform
 import shutil
@@ -69,6 +70,8 @@
 )
 
 
+logger = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
@@ -234,7 +237,11 @@ def _utc_now() -> str:
 
 
 def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
-    """Return combined size in bytes of all ONNX files + their .data companions.
+    """Return combined size in bytes of all ONNX files + their external data companions.
+
+    Parses the ONNX proto to discover all referenced external data files (not just
+    the conventional `.data` suffix). Falls back to the `.data` companion heuristic
+    if proto parsing is unavailable.
 
     Returns None if onnx_paths is empty or no files exist on disk.
     """
@@ -244,12 +251,24 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
     found_any = False
     for path_str in onnx_paths.values():
         p = Path(path_str)
-        if p.exists():
-            total += p.stat().st_size
-            found_any = True
-        data_p = p.with_suffix(p.suffix + ".data")
-        if data_p.exists():
-            total += data_p.stat().st_size
+        if not p.exists():
+            continue
+        total += p.stat().st_size
+        found_any = True
+        # Try to enumerate all external data files from the proto
+        try:
+            from winml.modelkit.onnx.external_data import get_external_data_files
+
+            ext_files = get_external_data_files(p)
+            for ext_name in ext_files:
+                ext_path = p.parent / ext_name
+                if ext_path.exists():
+                    total += ext_path.stat().st_size
+        except Exception:
+            # Fallback: check conventional .data companion
+            data_p = p.with_suffix(p.suffix + ".data")
+            if data_p.exists():
+                total += data_p.stat().st_size
     return total if found_any else None
 
 
@@ -1091,7 +1110,7 @@ def save_environment_info(path: Path) -> None:
     # `winml sys --format json` captures hardware details (devices, EPs,
     # backends) that the lightweight package-version probes above miss.
     try:
-        result = subprocess.run(
+        result = subprocess.run(  # noqa: S603
             [sys.executable, "-m", "winml", "sys", "--format", "json"],
             capture_output=True,
             text=True,
@@ -1099,8 +1118,8 @@ def save_environment_info(path: Path) -> None:
         )
         if result.returncode == 0:
             info["winml_sys"] = json.loads(result.stdout)
-    except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError):
-        pass  # winml sys unavailable or returned invalid JSON; skip optional field
+    except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc:
+        logger.debug("winml sys skipped: %s", exc)
 
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(json.dumps(info, indent=2), encoding="utf-8")
@@ -1213,6 +1232,11 @@ def parse_args() -> argparse.Namespace:
         help="Skip report generation (useful when running per-model in a pipeline loop)",
     )
     parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument(
+        "--raw-output",
+        action="store_true",
+        help="Keep raw subprocess output in eval_result.json without sanitization",
+    )
     parser.add_argument(
         "--continue",
         dest="continue_run",
@@ -1525,7 +1549,7 @@ def main() -> None:
             accuracy_result,
             ep=args.ep,
             onnx_size_bytes=onnx_size,
-            sanitize_fn=_sanitize_output,
+            sanitize_fn=None if args.raw_output else _sanitize_output,
         )
         results.append(result)
 

From 2652af177c0f74f1d88027bcf252a9750e1b2876 Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Thu, 4 Jun 2026 10:36:06 +0800
Subject: [PATCH 7/7] perf: compile noise patterns into a single regex

Replaces per-line linear scan with a pre-compiled regex for better
performance on large outputs.
---
 scripts/e2e_eval/run_eval.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index 840c5b562..365357c01 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -34,6 +34,7 @@
 import logging
 import os
 import platform
+import re
 import shutil
 import subprocess
 import sys
@@ -273,7 +274,7 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
 
 
 # Lines that carry no diagnostic value in eval_result.json.
-# Matching is case-insensitive, applied per-line.
+# Matching is case-insensitive, anchored at line start.
 _NOISE_PATTERNS = (
     "benchmarking onnx",
     "device:",
@@ -285,6 +286,7 @@ def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
     "outputs:",
     "samples/sec",
 )
+_NOISE_RE = re.compile("|".join(re.escape(p) for p in _NOISE_PATTERNS), re.IGNORECASE)
 
 # Box-drawing characters used by Rich tables.
 _BOX_CHARS = frozenset("─│┌┐└┘├┤┬┴┼")
@@ -305,8 +307,7 @@ def _sanitize_output(text: str) -> str:
         # Drop box-drawing table rows
         if stripped[0] in _BOX_CHARS:
             continue
-        low = stripped.lower()
-        if any(low.startswith(pat) for pat in _NOISE_PATTERNS):
+        if _NOISE_RE.match(stripped):
             continue
         kept.append(stripped)
     return "\n".join(kept)