|
| 1 | +# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html |
| 2 | +# For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE |
| 3 | +# Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt |
| 4 | + |
| 5 | +from __future__ import annotations |
| 6 | + |
| 7 | +import json |
| 8 | +from collections import defaultdict |
| 9 | +from collections.abc import Iterator |
| 10 | +from difflib import SequenceMatcher |
| 11 | +from pathlib import Path |
| 12 | + |
| 13 | +from pylint.reporters.json_reporter import OldJsonExport |
| 14 | +from pylint.testutils._primer.primer_command import PackageMessages |
| 15 | + |
# Minimum SequenceMatcher ratio to consider two residual messages "the same
# diagnostic". The identity fields (symbol, path, obj) already match, so a
# generous threshold is fine here.
_FUZZY_THRESHOLD = 0.5

# A pair of messages that represent the same diagnostic on both runs but with
# differing details, ordered as (old, new).
ChangedMessage = tuple[OldJsonExport, OldJsonExport]  # (old, new)

# Message fields that together describe *where* a diagnostic was raised;
# changes to any of them are merged into a single human-readable span.
_LOCATION_KEYS = {"line", "column", "endLine", "endColumn"}
| 24 | + |
| 25 | + |
| 26 | +def _match_key(msg: OldJsonExport) -> tuple[str, str, str]: |
| 27 | + return (msg["symbol"], msg["path"], msg["obj"]) |
| 28 | + |
| 29 | + |
def _fuzzy_pair(
    old_msgs: list[OldJsonExport], new_msgs: list[OldJsonExport]
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
    """Greedily pair residual messages by textual similarity.

    Each old message (in order) is paired with the not-yet-used new message
    whose ``SequenceMatcher`` ratio is strictly above ``_FUZZY_THRESHOLD``
    and highest among the remaining candidates (first best wins on ties).

    Returns (paired, unmatched_old, unmatched_new).
    """
    if not old_msgs or not new_msgs:
        return [], old_msgs, new_msgs

    # Hoist the dict -> str rendering out of the O(len(old) * len(new))
    # comparison loop; previously str(new) was recomputed for every old
    # message. Ratios (and therefore pairings) are unchanged.
    new_strs = [str(new) for new in new_msgs]

    paired: list[ChangedMessage] = []
    used_old: set[int] = set()
    used_new: set[int] = set()

    for i, old in enumerate(old_msgs):
        old_str = str(old)
        # Seeding best_ratio with the threshold makes the strict ">" test
        # below double as the acceptance criterion.
        best_ratio = _FUZZY_THRESHOLD
        best_idx = -1
        for j, new_str in enumerate(new_strs):
            if j in used_new:
                continue
            ratio = SequenceMatcher(None, old_str, new_str).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_idx = j
        if best_idx >= 0:
            paired.append((old, new_msgs[best_idx]))
            used_old.add(i)
            used_new.add(best_idx)

    unmatched_old = [m for i, m in enumerate(old_msgs) if i not in used_old]
    unmatched_new = [m for j, m in enumerate(new_msgs) if j not in used_new]
    return paired, unmatched_old, unmatched_new
| 63 | + |
| 64 | + |
def _fuzzy_match_residuals(
    old_residuals: list[OldJsonExport], new_residuals: list[OldJsonExport]
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
    """Fuzzy-match residual messages by identity fields then similarity.

    Returns (paired, unmatched_old, unmatched_new) with original order preserved.
    """
    # Bucket both sides by (symbol, path, obj) so only messages sharing their
    # identity fields are ever compared for textual similarity.
    old_buckets: dict[tuple[str, str, str], list[OldJsonExport]] = {}
    new_buckets: dict[tuple[str, str, str], list[OldJsonExport]] = {}
    for msg in old_residuals:
        old_buckets.setdefault(_match_key(msg), []).append(msg)
    for msg in new_residuals:
        new_buckets.setdefault(_match_key(msg), []).append(msg)

    paired: list[ChangedMessage] = []
    matched_old: set[int] = set()
    matched_new: set[int] = set()
    for key, bucket in old_buckets.items():
        counterpart = new_buckets.get(key)
        if counterpart is None:
            continue
        for old_msg, new_msg in _fuzzy_pair(bucket, counterpart)[0]:
            paired.append((old_msg, new_msg))
            # Track pairings by object identity so equal-but-distinct
            # messages are not dropped from the leftovers.
            matched_old.add(id(old_msg))
            matched_new.add(id(new_msg))

    # Rebuild the leftovers from the original lists so ordering is stable.
    still_missing = [m for m in old_residuals if id(m) not in matched_old]
    still_new = [m for m in new_residuals if id(m) not in matched_new]
    return paired, still_missing, still_new
| 94 | + |
| 95 | + |
def format_span(msg: OldJsonExport) -> str:
    """Format a message's location as ``line:col to endLine:endCol``."""
    span = f"{msg['line']}:{msg['column']}"
    end = (msg.get("endLine"), msg.get("endColumn"))
    if None in end:
        # End position is only rendered when both parts are present.
        return span
    return f"{span} to {end[0]}:{end[1]}"
| 104 | + |
| 105 | + |
def message_diff(old: OldJsonExport, new: OldJsonExport) -> str:
    """Return a compact summary of changed fields between two messages.

    Location changes are merged into a single human-readable span.
    String fields (like ``message``) get a ``diff`` code block so GitHub
    renders them with red/green highlighting.
    """
    changed_keys = {
        key
        for key in old
        if old[key] != new[key]  # type: ignore[literal-required]
    }

    parts: list[str] = []
    # Any line/column/endLine/endColumn change collapses to one sentence.
    if changed_keys & _LOCATION_KEYS:
        parts.append(f"Was raised on {format_span(old)}, now on {format_span(new)}.")

    # Remaining fields (typically ``message``), in deterministic order.
    for key in sorted(changed_keys - _LOCATION_KEYS):
        before = old[key]  # type: ignore[literal-required]
        after = new[key]  # type: ignore[literal-required]
        if isinstance(before, str) and isinstance(after, str):
            parts.append(f"The {key} changed:\n```diff\n- {before}\n+ {after}\n```")
        else:
            parts.append(f"{key}: {before!r} -> {after!r}")
    return "\n".join(parts)
| 132 | + |
| 133 | + |
class Comparator:
    """Cross-reference two primer JSON outputs and iterate over differences.

    Yields ``(package, missing, new, changed)`` for each package that has at
    least one difference. *changed* contains pairs of ``(old, new)`` messages
    that are the same diagnostic but with altered details (line, message text,
    etc.).
    """

    def __init__(
        self, main_data: PackageMessages, pr_data: PackageMessages
    ) -> None:
        """Compute missing/new/changed message sets for every package.

        NOTE(review): ``pr_data`` is mutated in place — exactly-matching
        messages are removed from its per-package lists below, so callers
        should not reuse it afterwards.
        """
        # Messages present on main but absent from the PR run.
        self.missing_messages: dict[str, list[OldJsonExport]] = {}
        # Messages introduced by the PR run.
        self.new_messages: dict[str, list[OldJsonExport]] = {}
        # Same diagnostic on both sides but with differing details.
        self.changed_messages: dict[str, list[ChangedMessage]] = {}
        # Package name -> commit hash recorded by the PR run.
        self.commits: dict[str, str] = {}

        # Only packages present in main_data are compared; a package that
        # appears solely in pr_data is ignored, and one missing from pr_data
        # would raise KeyError on the next line.
        for package, data in main_data.items():
            self.commits[package] = pr_data[package]["commit"]
            # First pass: exact-match removal (same as before).
            residual_old: list[OldJsonExport] = []
            for message in data["messages"]:
                try:
                    # list.remove drops the first equal occurrence, so
                    # duplicated messages are cancelled one-for-one.
                    pr_data[package]["messages"].remove(message)
                except ValueError:
                    residual_old.append(message)

            # Second pass: fuzzy-match residuals to detect *changed* messages.
            paired, final_missing, final_new = _fuzzy_match_residuals(
                residual_old, pr_data[package]["messages"]
            )

            self.missing_messages[package] = final_missing
            self.new_messages[package] = final_new
            self.changed_messages[package] = paired

    def __iter__(
        self,
    ) -> Iterator[
        tuple[
            str,
            list[OldJsonExport],
            list[OldJsonExport],
            list[ChangedMessage],
        ]
    ]:
        """Yield ``(package, missing, new, changed)`` for packages that differ."""
        for package in self.missing_messages:
            missing = self.missing_messages[package]
            new = self.new_messages[package]
            changed = self.changed_messages[package]
            # Packages with no differences at all are skipped entirely.
            if not missing and not new and not changed:
                continue
            yield package, missing, new, changed

    @staticmethod
    def from_json(
        base_file: str, new_file: str, batches: int | None = None
    ) -> Comparator:
        """Build a Comparator from JSON file paths, handling batched runs."""
        if batches is None:
            main_data = _load_json(base_file)
            pr_data = _load_json(new_file)
        else:
            # When batched, the file names contain a literal "BATCHIDX"
            # placeholder that is substituted per batch below.
            # NOTE(review): the annotated re-assignment of main_data/pr_data
            # here is fine at runtime but flagged by some type checkers.
            main_data: PackageMessages = {}
            pr_data: PackageMessages = {}
            for idx in range(batches):
                main_data.update(
                    _load_json(base_file.replace("BATCHIDX", "batch" + str(idx)))
                )
                pr_data.update(
                    _load_json(new_file.replace("BATCHIDX", "batch" + str(idx)))
                )
        return Comparator(main_data, pr_data)
| 207 | + |
| 208 | + |
| 209 | +def _load_json(file_path: Path | str) -> PackageMessages: |
| 210 | + with open(file_path, encoding="utf-8") as f: |
| 211 | + result: PackageMessages = json.load(f) |
| 212 | + return result |
0 commit comments