diff --git a/scripts/e2e_eval/run_eval.py b/scripts/e2e_eval/run_eval.py
index d47b6d3b1..365357c01 100644
--- a/scripts/e2e_eval/run_eval.py
+++ b/scripts/e2e_eval/run_eval.py
@@ -31,8 +31,10 @@
 import argparse
 import contextlib
 import json
+import logging
 import os
 import platform
+import re
 import shutil
 import subprocess
 import sys
@@ -69,6 +71,8 @@
 )
 
 
+logger = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
@@ -233,6 +237,82 @@ def _utc_now() -> str:
     return datetime.now(timezone.utc).isoformat()
 
 
+def _compute_onnx_size(onnx_paths: dict[str, str]) -> int | None:
+    """Return the combined on-disk size in bytes of ONNX files and their external data.
+
+    External data files are looked up with ``get_external_data_files`` (imported
+    lazily). If that import or call fails, the conventional ``<model>.data``
+    companion is used instead. Every file is counted exactly once, even when
+    several entries reference it or the lookup fails part-way through; paths
+    that are not regular files are ignored.
+
+    Returns None if onnx_paths is empty or no ONNX file exists on disk.
+    """
+    if not onnx_paths:
+        return None
+    # Resolved paths in a set: a file reachable via several entries is counted once.
+    files: set[Path] = set()
+    for path_str in onnx_paths.values():
+        model = Path(path_str)
+        if not model.is_file():
+            continue
+        files.add(model.resolve())
+        # Try to enumerate all external data files via the project helper.
+        try:
+            from winml.modelkit.onnx.external_data import get_external_data_files
+
+            companions = [model.parent / name for name in get_external_data_files(model)]
+        except Exception:
+            # Fallback: check the conventional .data companion. Nothing has been
+            # recorded for this model's companions yet, so nothing is double counted.
+            companions = [model.with_suffix(model.suffix + ".data")]
+        # is_file() also rejects directories (e.g. from an empty external-data name).
+        files.update(c.resolve() for c in companions if c.is_file())
+    if not files:
+        return None
+    return sum(f.stat().st_size for f in files)
+
+
+# Prefixes of lines that carry no diagnostic value in eval_result.json. Matching is
+# case-insensitive and anchored at line start because callers use _NOISE_RE.match().
+_NOISE_PATTERNS = (
+    "benchmarking onnx",
+    "device:",
+    "task:",
+    "latency (ms)",
+    "throughput:",
+    "results saved to",
+    "inputs:",
+    "outputs:",
+    "samples/sec",
+)
+_NOISE_RE = re.compile("|".join(re.escape(p) for p in _NOISE_PATTERNS), re.IGNORECASE)
+
+# Box-drawing characters used by Rich tables (light set plus HEAVY_HEAD's heavy header rows).
+_BOX_CHARS = frozenset("─│┌┐└┘├┤┬┴┼━┃┏┓┡┩╇┳")
+
+
+def _sanitize_output(text: str) -> str:
+    """Filter routine CLI chrome out of captured subprocess output.
+
+    Every line is whitespace-stripped first. Blank lines, lines that
+    start with a box-drawing character (table rows), and lines that
+    begin with a known noise prefix are discarded; the remaining
+    stripped lines are returned joined by newlines.
+    """
+
+    def _is_chrome(candidate: str) -> bool:
+        # Table row, or a banner line matched at its start by the noise regex.
+        return candidate[0] in _BOX_CHARS or _NOISE_RE.match(candidate) is not None
+
+    survivors = (
+        candidate
+        for candidate in map(str.strip, text.splitlines())
+        if candidate and not _is_chrome(candidate)
+    )
+    return "\n".join(survivors)
+
+
 def _kill_process_tree(pid: int) -> None:
     """Kill a process and all its children.
 
@@ -1028,6 +1108,20 @@ def save_environment_info(path: Path) -> None:
     except (subprocess.TimeoutExpired, FileNotFoundError):
         pass  # git not available or timed out; commit info stays empty
 
+    # `winml sys --format json` captures hardware details (devices, EPs,
+    # backends) that the lightweight package-version probes above miss.
+    try:
+        result = subprocess.run(  # noqa: S603
+            [sys.executable, "-m", "winml", "sys", "--format", "json"],
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        if result.returncode == 0:
+            info["winml_sys"] = json.loads(result.stdout)
+    except (subprocess.TimeoutExpired, FileNotFoundError, json.JSONDecodeError) as exc:
+        logger.debug("winml sys skipped: %s", exc)
+
     path.parent.mkdir(parents=True, exist_ok=True)
     path.write_text(json.dumps(info, indent=2), encoding="utf-8")
 
@@ -1139,6 +1233,11 @@ def parse_args() -> argparse.Namespace:
         help="Skip report generation (useful when running per-model in a pipeline loop)",
     )
     parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument(
+        "--raw-output",
+        action="store_true",
+        help="Keep raw subprocess output in eval_result.json without sanitization",
+    )
     parser.add_argument(
         "--continue",
         dest="continue_run",
@@ -1399,6 +1498,7 @@ def main() -> None:
                 ep=args.ep,
             )
             onnx_paths = build_result["onnx_paths"] if build_result["success"] else {}
+            onnx_size = _compute_onnx_size(onnx_paths)
 
             if not build_result["success"]:
                 # Build failed — synthesize failed result for downstream phases
@@ -1443,7 +1543,14 @@ def main() -> None:
             break
 
         result = build_eval_result(
-            entry, perf_proc, args.device, eval_types_run, accuracy_result, ep=args.ep
+            entry,
+            perf_proc,
+            args.device,
+            eval_types_run,
+            accuracy_result,
+            ep=args.ep,
+            onnx_size_bytes=onnx_size,
+            sanitize_fn=None if args.raw_output else _sanitize_output,
         )
         results.append(result)
 
diff --git a/scripts/e2e_eval/utils/reporter.py b/scripts/e2e_eval/utils/reporter.py
index a97fef69b..5da4f75ad 100644
--- a/scripts/e2e_eval/utils/reporter.py
+++ b/scripts/e2e_eval/utils/reporter.py
@@ -23,6 +23,8 @@
 
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from .registry import ModelEntry
 
 
@@ -38,6 +40,8 @@ def build_eval_result(
     eval_types_run: list[str],
     accuracy_result: dict | None = None,
     ep: str | None = None,
+    onnx_size_bytes: int | None = None,
+    sanitize_fn: Callable[[str], str] | None = None,
 ) -> dict:
     """Build a unified eval_result dict (facts only, no derived fields).
 
@@ -46,16 +50,28 @@ def build_eval_result(
     accuracy_result is the accuracy sub-section dict (or None if not run).
     ep is the explicit execution provider (e.g., "qnn", "dml"), or None when
     not specified (device-to-provider mapping was used).
+    onnx_size_bytes is the combined size of the exported ONNX + .data files.
+    sanitize_fn, when provided, is applied to stdout/stderr to remove noise.
     """
     perf_section: dict | None = None
     if perf_proc is not None:
         passed = perf_proc["exit_code"] == 0
+        raw_stdout = perf_proc["stdout"]
+        raw_stderr = perf_proc["stderr"]
+        if sanitize_fn is not None:
+            stdout = sanitize_fn(raw_stdout)
+            stderr = sanitize_fn(raw_stderr)
+        else:
+            stdout = raw_stdout
+            stderr = raw_stderr
         perf_section = {
             "passed": passed,
             "elapsed": perf_proc["elapsed"],
             "exit_code": perf_proc["exit_code"],
-            "stdout_output": perf_proc["stdout"],
-            "stderr_output": perf_proc["stderr"],
+            "stdout_output": stdout,
+            "stderr_output": stderr,
+            "raw_stdout": raw_stdout,
+            "raw_stderr": raw_stderr,
             "timeout": perf_proc["timeout"],
             "command": perf_proc["command"],
             "error": perf_proc.get("error_summary", ""),
@@ -76,6 +92,8 @@ def build_eval_result(
         "accuracy": accuracy_result,
     }
     # Optional fields: only include when explicitly provided by the user.
+    if onnx_size_bytes is not None:
+        result["onnx_size_bytes"] = onnx_size_bytes
     if ep is not None:
         result["ep"] = ep
     return result
@@ -323,8 +341,9 @@ def generate_html_report(
     output_path: Path,
     registry_path: Path | None = None,
 ) -> None:
-    from .accuracy import format_delta
     """Generate interactive HTML report with Perf and Accuracy tabs."""
+    from .accuracy import format_delta
+
     results = report_data.get("results", [])
 
     # Load registry for enrichment
@@ -366,9 +385,7 @@ def generate_html_report(
                     if acc is not None
                     else None
                 ),
-                "delta_display": (
-                    format_delta(acc) if acc and not acc.get("skipped") else ""
-                ),
+                "delta_display": (format_delta(acc) if acc and not acc.get("skipped") else ""),
                 "metric": (
                     {
                         "name": (acc.get("winml_metric") or {}).get("metric"),
diff --git a/tests/unit/eval/test_eval.py b/tests/unit/eval/test_eval.py
index 568df6dbb..86b5426ff 100644
--- a/tests/unit/eval/test_eval.py
+++ b/tests/unit/eval/test_eval.py
@@ -110,9 +110,7 @@ def test_feature_extraction_mapped_to_hf_image_feature_extraction_for_vision_mod
         fake_onnx_config = MagicMock()
         fake_onnx_config.inputs = {"pixel_values": object()}
 
-        config = WinMLEvaluationConfig(
-            model_id="facebook/dinov2-base", task="feature-extraction"
-        )
+        config = WinMLEvaluationConfig(model_id="facebook/dinov2-base", task="feature-extraction")
         with (
             patch(
                 "transformers.AutoConfig.from_pretrained",
@@ -1158,6 +1156,45 @@ def test_ep_present_when_provided(self):
         )
         assert result["ep"] == "qnn"
 
+    def test_sanitize_fn_preserves_raw_perf_output(self):
+        """Sanitized text lands in *_output while raw_* keeps the originals."""
+        raw_out = "Latency (ms): 12.5\nThroughput: 80 samples/sec\nsome error line"
+
+        def strip_perf(text: str) -> str:
+            # Discard any line that mentions a perf keyword, ignoring case.
+            def keep(line: str) -> bool:
+                lowered = line.lower()
+                return not ("latency" in lowered or "throughput" in lowered)
+
+            return "\n".join(filter(keep, text.splitlines()))
+
+        result = self._load_reporter().build_eval_result(
+            entry=self._make_entry(),
+            perf_proc={
+                "exit_code": 0,
+                "stdout": raw_out,
+                "stderr": "warning: device busy",
+                "elapsed": 5.0,
+                "timeout": False,
+                "command": "winml perf",
+                "timestamp": "2026-01-01T00:00:00+00:00",
+            },
+            device="cpu",
+            eval_types_run=["perf"],
+            accuracy_result=None,
+            ep=None,
+            sanitize_fn=strip_perf,
+        )
+
+        perf = result["perf"]
+        # The sanitized stdout must have lost the latency/throughput lines ...
+        assert "Latency" not in perf["stdout_output"]
+        assert "Throughput" not in perf["stdout_output"]
+        # ... while the raw fields still carry the original perf data.
+        assert "Latency (ms): 12.5" in perf["raw_stdout"]
+        assert "Throughput: 80 samples/sec" in perf["raw_stdout"]
+        assert perf["raw_stderr"] == "warning: device busy"
+
 
 class TestDefaultDatasetImmutability:
     """Tests that module-level _DEFAULT_DATASETS are not corrupted."""