Skip to content
109 changes: 108 additions & 1 deletion scripts/e2e_eval/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
import argparse
import contextlib
import json
import logging
import os
import platform
import re
import shutil
import subprocess
import sys
Expand Down Expand Up @@ -69,6 +71,8 @@
)


logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -233,6 +237,82 @@ def _utc_now() -> str:
return datetime.now(timezone.utc).isoformat()


def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
"""Return combined size in bytes of all ONNX files + their external data companions.

Parses the ONNX proto to discover all referenced external data files (not just
the conventional `.data` suffix). Falls back to the `.data` companion heuristic
if proto parsing is unavailable.

Returns None if onnx_paths is empty or no files exist on disk.
"""
if not onnx_paths:
return None
total = 0
found_any = False
for path_str in onnx_paths.values():
p = Path(path_str)
if not p.exists():
continue
total += p.stat().st_size
found_any = True
# Try to enumerate all external data files from the proto
try:
from winml.modelkit.onnx.external_data import get_external_data_files

ext_files = get_external_data_files(p)
for ext_name in ext_files:
ext_path = p.parent / ext_name
if ext_path.exists():
total += ext_path.stat().st_size
except Exception:
# Fallback: check conventional .data companion
data_p = p.with_suffix(p.suffix + ".data")
if data_p.exists():
total += data_p.stat().st_size
return total if found_any else None


# Lines that carry no diagnostic value in eval_result.json.
# Matching is case-insensitive, anchored at line start.
_NOISE_PATTERNS = (
"benchmarking onnx",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A little strange... is there any way we could add a parameter to the eval command to just drop them?

"device:",
"task:",
"latency (ms)",
"throughput:",
"results saved to",
Comment thread
DingmaomaoBJTU marked this conversation as resolved.
"inputs:",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple of patterns in _NOISE_PATTERNS can swallow legitimate diagnostic content with the current low.startswith(pat) matching:

  • "inputs:" / "outputs:" — these silently strip lines like Inputs: expected (1, 224, 224, 3), got (1, 3, 224, 224) (a real shape-mismatch error), which is exactly the kind of "error-relevant" content a sanitizer is supposed to keep. Same concern for "device:" if a downstream error ever reads something like Device: <name> is not available, falling back to CPU.
  • "samples/sec" — dead pattern under startswith semantics. Throughput: 80 samples/sec is already covered by "throughput:" above; no winml perf line literally starts with samples/sec.

Tightening options (cheapest first):

  1. Just drop inputs: / outputs: / samples/sec. The remaining patterns are unambiguous CLI chrome.
  2. Switch to exact-prefix-with-boundary: only strip when the line is pat followed by space or end, e.g. low == pat or low.startswith(pat + " "), so error lines that happen to start with Inputs: but carry non-trivial content survive.

🤖 Generated with GitHub Copilot CLI

"outputs:",
"samples/sec",
)
_NOISE_RE = re.compile("|".join(re.escape(p) for p in _NOISE_PATTERNS), re.IGNORECASE)

# Box-drawing characters used by Rich tables.
_BOX_CHARS = frozenset("─│┌┐└┘├┤┬┴┼")
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_BOX_CHARS only covers Unicode's LIGHT box-drawing style (─│┌┐└┘├┤┬┴┼), but winml perf uses Rich's default Table(), which renders with box.HEAVY_HEAD. I rendered a default Rich table locally and 3 of the 5 lines start with chars not in this set:

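| Row | Rendered line | First char | In _BOX_CHARS? |
| --- | --- | --- | --- |
| top border | ┏━━━━━┳━━━━━┓ | ┏ (U+250F) | no |
| header row | ┃ Avg ┃ P90 ┃ | ┃ (U+2503) | no |
| head separator | ┡━━━━━╇━━━━━┩ | ┡ (U+2521) | no |
| data row | │ 12.5 │ 15.2 │ | │ (U+2502) | yes |
| bottom border | └─────┴─────┘ | └ (U+2514) | yes |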

Net result: eval_result.json keeps the top border + header text + head separator while stripping data rows and the bottom border — arguably uglier than no sanitization at all (orphaned half-table chrome).

Cheap fix — use a Unicode block range check instead of a hand-curated set:

if 0x2500 <= ord(stripped[0]) <= 0x257F:  # Unicode "Box Drawing" block
    continue

That covers all four Rich styles (light, heavy, double, rounded) in one rule and won't silently drift the next time someone changes the table style.

🤖 Generated with GitHub Copilot CLI



def _sanitize_output(text: str) -> str:
"""Strip routine CLI chrome from subprocess output, keeping error content.

Removes Rich benchmark tables, device/IO banners, and path lines that
bloat eval_result.json without aiding failure diagnosis. All classifier
patterns (see classifier.py) are error-related and survive this filter.
"""
kept: list[str] = []
for line in text.splitlines():
stripped = line.strip()
if not stripped:
continue
Comment thread
DingmaomaoBJTU marked this conversation as resolved.
# Drop box-drawing table rows
if stripped[0] in _BOX_CHARS:
continue
if _NOISE_RE.match(stripped):
continue
kept.append(stripped)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Appending stripped (post-.strip()) discards leading indentation, which destroys the structure of multi-line errors — the very content the docstring promises to preserve ("All classifier patterns are error-related and survive this filter"). For example, a traceback:

  File "foo.py", line 5, in bar
    raise RuntimeError("x")

becomes a visually-flat:

File "foo.py", line 5, in bar
raise RuntimeError("x")

which is noticeably harder to read.

Suggested fix — keep stripped only for the box/noise classifier checks, then append the original line (lightly trimmed):

if not stripped:
    continue
if 0x2500 <= ord(stripped[0]) <= 0x257F:
    continue
low = stripped.lower()
if any(low.startswith(pat) for pat in _NOISE_PATTERNS):
    continue
kept.append(line.rstrip())

🤖 Generated with GitHub Copilot CLI

return "\n".join(kept)


def _kill_process_tree(pid: int) -> None:
"""Kill a process and all its children.

Comment thread
DingmaomaoBJTU marked this conversation as resolved.
Expand Down Expand Up @@ -1028,6 +1108,20 @@ def save_environment_info(path: Path) -> None:
except (subprocess.TimeoutExpired, FileNotFoundError):
pass # git not available or timed out; commit info stays empty

# `winml sys --format json` captures hardware details (devices, EPs,
# backends) that the lightweight package-version probes above miss.
try:
result = subprocess.run( # noqa: S603
[sys.executable, "-m", "winml", "sys", "--format", "json"],
capture_output=True,
text=True,
timeout=30,
Comment thread
DingmaomaoBJTU marked this conversation as resolved.
)
if result.returncode == 0:
info["winml_sys"] = json.loads(result.stdout)
Comment thread
DingmaomaoBJTU marked this conversation as resolved.
except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc:
logger.debug("winml sys skipped: %s", exc)

path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(info, indent=2), encoding="utf-8")

Expand Down Expand Up @@ -1139,6 +1233,11 @@ def parse_args() -> argparse.Namespace:
help="Skip report generation (useful when running per-model in a pipeline loop)",
)
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
parser.add_argument(
"--raw-output",
action="store_true",
help="Keep raw subprocess output in eval_result.json without sanitization",
)
parser.add_argument(
"--continue",
dest="continue_run",
Expand Down Expand Up @@ -1399,6 +1498,7 @@ def main() -> None:
ep=args.ep,
)
onnx_paths = build_result["onnx_paths"] if build_result["success"] else {}
onnx_size = _compute_onnx_size(onnx_paths)

if not build_result["success"]:
# Build failed — synthesize failed result for downstream phases
Expand Down Expand Up @@ -1443,7 +1543,14 @@ def main() -> None:
break

result = build_eval_result(
entry, perf_proc, args.device, eval_types_run, accuracy_result, ep=args.ep
entry,
perf_proc,
args.device,
eval_types_run,
accuracy_result,
ep=args.ep,
onnx_size_bytes=onnx_size,
sanitize_fn=None if args.raw_output else _sanitize_output,
)
results.append(result)

Expand Down
29 changes: 23 additions & 6 deletions scripts/e2e_eval/utils/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@


if TYPE_CHECKING:
from collections.abc import Callable

from .registry import ModelEntry


Expand All @@ -38,6 +40,8 @@ def build_eval_result(
eval_types_run: list[str],
accuracy_result: dict | None = None,
ep: str | None = None,
onnx_size_bytes: int | None = None,
sanitize_fn: Callable[[str], str] | None = None,
) -> dict:
"""Build a unified eval_result dict (facts only, no derived fields).

Expand All @@ -46,16 +50,28 @@ def build_eval_result(
accuracy_result is the accuracy sub-section dict (or None if not run).
ep is the explicit execution provider (e.g., "qnn", "dml"), or None when
not specified (device-to-provider mapping was used).
onnx_size_bytes is the combined size of the exported ONNX + .data files.
sanitize_fn, when provided, is applied to stdout/stderr to remove noise.
"""
perf_section: dict | None = None
if perf_proc is not None:
passed = perf_proc["exit_code"] == 0
raw_stdout = perf_proc["stdout"]
raw_stderr = perf_proc["stderr"]
if sanitize_fn is not None:
stdout = sanitize_fn(raw_stdout)
stderr = sanitize_fn(raw_stderr)
else:
stdout = raw_stdout
stderr = raw_stderr
perf_section = {
"passed": passed,
"elapsed": perf_proc["elapsed"],
"exit_code": perf_proc["exit_code"],
"stdout_output": perf_proc["stdout"],
"stderr_output": perf_proc["stderr"],
"stdout_output": stdout,
"stderr_output": stderr,
"raw_stdout": raw_stdout,
"raw_stderr": raw_stderr,
"timeout": perf_proc["timeout"],
"command": perf_proc["command"],
"error": perf_proc.get("error_summary", ""),
Expand All @@ -76,6 +92,8 @@ def build_eval_result(
"accuracy": accuracy_result,
}
# Optional fields: only include when explicitly provided by the user.
if onnx_size_bytes is not None:
result["onnx_size_bytes"] = onnx_size_bytes
if ep is not None:
result["ep"] = ep
return result
Expand Down Expand Up @@ -323,8 +341,9 @@ def generate_html_report(
output_path: Path,
registry_path: Path | None = None,
) -> None:
from .accuracy import format_delta
"""Generate interactive HTML report with Perf and Accuracy tabs."""
from .accuracy import format_delta

Comment thread
DingmaomaoBJTU marked this conversation as resolved.
Comment thread
DingmaomaoBJTU marked this conversation as resolved.
results = report_data.get("results", [])

# Load registry for enrichment
Expand Down Expand Up @@ -366,9 +385,7 @@ def generate_html_report(
if acc is not None
else None
),
"delta_display": (
format_delta(acc) if acc and not acc.get("skipped") else ""
),
"delta_display": (format_delta(acc) if acc and not acc.get("skipped") else ""),
"metric": (
{
"name": (acc.get("winml_metric") or {}).get("metric"),
Expand Down
43 changes: 40 additions & 3 deletions tests/unit/eval/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,7 @@ def test_feature_extraction_mapped_to_hf_image_feature_extraction_for_vision_mod
fake_onnx_config = MagicMock()
fake_onnx_config.inputs = {"pixel_values": object()}

config = WinMLEvaluationConfig(
model_id="facebook/dinov2-base", task="feature-extraction"
)
config = WinMLEvaluationConfig(model_id="facebook/dinov2-base", task="feature-extraction")
with (
patch(
"transformers.AutoConfig.from_pretrained",
Expand Down Expand Up @@ -1158,6 +1156,45 @@ def test_ep_present_when_provided(self):
)
assert result["ep"] == "qnn"

def test_sanitize_fn_preserves_raw_perf_output(self):
    """sanitize_fn filters stdout_output/stderr_output while raw_* keep the originals.

    Exact-equality assertions are used so the test also proves that lines the
    sanitizer does not target survive, and that stderr is routed through
    sanitize_fn as well as stdout.
    """
    reporter = self._load_reporter()

    raw_stdout = "Latency (ms): 12.5\nThroughput: 80 samples/sec\nsome error line"
    raw_stderr = "warning: device busy"
    perf_proc = {
        "exit_code": 0,
        "stdout": raw_stdout,
        "stderr": raw_stderr,
        "elapsed": 5.0,
        "timeout": False,
        "command": "winml perf",
        "timestamp": "2026-01-01T00:00:00+00:00",
    }

    def strip_perf(text: str) -> str:
        # Minimal stand-in sanitizer: drop any line mentioning latency/throughput.
        return "\n".join(
            line
            for line in text.splitlines()
            if "latency" not in line.lower() and "throughput" not in line.lower()
        )

    result = reporter.build_eval_result(
        entry=self._make_entry(),
        perf_proc=perf_proc,
        device="cpu",
        eval_types_run=["perf"],
        accuracy_result=None,
        ep=None,
        sanitize_fn=strip_perf,
    )

    perf = result["perf"]
    # Sanitized output drops the latency/throughput lines and nothing else.
    assert "Latency" not in perf["stdout_output"]
    assert "Throughput" not in perf["stdout_output"]
    assert perf["stdout_output"] == "some error line"
    # stderr has no matching lines, so it passes through sanitize_fn unchanged.
    assert perf["stderr_output"] == raw_stderr
    # Raw output preserves the original perf data verbatim.
    assert perf["raw_stdout"] == raw_stdout
    assert perf["raw_stderr"] == raw_stderr


class TestDefaultDatasetImmutability:
"""Tests that module-level _DEFAULT_DATASETS are not corrupted."""
Expand Down
Loading