Skip to content

Commit 53f2359

Browse files
Extract Comparator class for smarter primer diff
Instead of reporting changed messages as separate "new" + "missing" entries, fuzzy-match residuals by identity fields (symbol, path, obj) and SequenceMatcher similarity to detect messages that merely moved lines or changed text. The PR comment now shows a "changed messages" section with compact diffs (location spans, diff code blocks).
1 parent 9a24747 commit 53f2359

23 files changed

Lines changed: 485 additions & 124 deletions
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
2+
# For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE
3+
# Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt
4+
5+
from __future__ import annotations
6+
7+
import json
8+
from collections import defaultdict
9+
from collections.abc import Iterator
10+
from difflib import SequenceMatcher
11+
from pathlib import Path
12+
13+
from pylint.reporters.json_reporter import OldJsonExport
14+
from pylint.testutils._primer.primer_command import PackageMessages
15+
16+
# Minimum SequenceMatcher ratio to consider two residual messages "the same
# diagnostic". The identity fields (symbol, path, obj) already match, so a
# generous threshold is fine here.
_FUZZY_THRESHOLD = 0.5

# A changed diagnostic, paired as (message from the main run, message from
# the PR run).
ChangedMessage = tuple[OldJsonExport, OldJsonExport]  # (old, new)

# Keys of OldJsonExport that describe a message's source location; changes to
# these are summarised as a single span instead of field-by-field.
_LOCATION_KEYS = {"line", "column", "endLine", "endColumn"}
24+
25+
26+
def _match_key(msg: OldJsonExport) -> tuple[str, str, str]:
27+
return (msg["symbol"], msg["path"], msg["obj"])
28+
29+
30+
def _fuzzy_pair(
31+
old_msgs: list[OldJsonExport], new_msgs: list[OldJsonExport]
32+
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
33+
"""Pair residual messages by similarity.
34+
35+
Returns (paired, unmatched_old, unmatched_new).
36+
"""
37+
if not old_msgs or not new_msgs:
38+
return [], old_msgs, new_msgs
39+
40+
paired: list[ChangedMessage] = []
41+
used_old: set[int] = set()
42+
used_new: set[int] = set()
43+
44+
for i, old in enumerate(old_msgs):
45+
old_str = str(old)
46+
best_ratio = _FUZZY_THRESHOLD
47+
best_idx = -1
48+
for j, new in enumerate(new_msgs):
49+
if j in used_new:
50+
continue
51+
ratio = SequenceMatcher(None, old_str, str(new)).ratio()
52+
if ratio > best_ratio:
53+
best_ratio = ratio
54+
best_idx = j
55+
if best_idx >= 0:
56+
paired.append((old, new_msgs[best_idx]))
57+
used_old.add(i)
58+
used_new.add(best_idx)
59+
60+
unmatched_old = [m for i, m in enumerate(old_msgs) if i not in used_old]
61+
unmatched_new = [m for j, m in enumerate(new_msgs) if j not in used_new]
62+
return paired, unmatched_old, unmatched_new
63+
64+
65+
def _fuzzy_match_residuals(
    old_residuals: list[OldJsonExport], new_residuals: list[OldJsonExport]
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
    """Fuzzy-match residual messages by identity fields then similarity.

    Returns (paired, unmatched_old, unmatched_new) with original order preserved.
    """
    # Bucket both sides by their identity fields (symbol, path, obj).
    old_buckets: dict[tuple[str, str, str], list[OldJsonExport]] = defaultdict(list)
    new_buckets: dict[tuple[str, str, str], list[OldJsonExport]] = defaultdict(list)
    for message in old_residuals:
        old_buckets[_match_key(message)].append(message)
    for message in new_residuals:
        new_buckets[_match_key(message)].append(message)

    # Pair within each bucket present on both sides. Paired objects are
    # remembered by identity so the final filters can keep the callers'
    # original ordering.
    paired: list[ChangedMessage] = []
    matched_old: set[int] = set()
    matched_new: set[int] = set()
    for key, bucket in old_buckets.items():
        if key not in new_buckets:
            continue
        matches, _, _ = _fuzzy_pair(bucket, new_buckets[key])
        paired.extend(matches)
        matched_old.update(id(old) for old, _ in matches)
        matched_new.update(id(new) for _, new in matches)

    final_missing = [m for m in old_residuals if id(m) not in matched_old]
    final_new = [m for m in new_residuals if id(m) not in matched_new]
    return paired, final_missing, final_new
94+
95+
96+
def format_span(msg: OldJsonExport) -> str:
    """Format a message's location as ``line:col to endLine:endCol``.

    Falls back to just ``line:col`` when either end coordinate is absent
    or None.
    """
    span = f"{msg['line']}:{msg['column']}"
    end_line, end_col = msg.get("endLine"), msg.get("endColumn")
    if end_line is None or end_col is None:
        return span
    return f"{span} to {end_line}:{end_col}"
104+
105+
106+
def message_diff(old: OldJsonExport, new: OldJsonExport) -> str:
    """Return a compact summary of changed fields between two messages.

    Location changes (line/column/endLine/endColumn) are merged into a
    single human-readable span. String fields (like ``message``) get a
    ``diff`` code block so GitHub renders them with red/green highlighting.
    """
    # Compare over the union of keys: iterating ``old`` alone would raise
    # KeyError for keys missing from ``new`` and silently ignore keys that
    # only exist in ``new``.
    missing = object()
    changed_keys: set[str] = set()
    for key in set(old) | set(new):
        if old.get(key, missing) != new.get(key, missing):  # type: ignore[arg-type]
            changed_keys.add(key)

    parts: list[str] = []
    # Location: combine line/column/endLine/endColumn into one sentence.
    if changed_keys & _LOCATION_KEYS:
        parts.append(f"Was raised on {format_span(old)}, now on {format_span(new)}.")

    # Other fields (typically ``message``).
    for key in sorted(changed_keys - _LOCATION_KEYS):
        old_val = old.get(key)  # type: ignore[arg-type]
        new_val = new.get(key)  # type: ignore[arg-type]
        if isinstance(old_val, str) and isinstance(new_val, str):
            parts.append(f"The {key} changed:\n```diff\n- {old_val}\n+ {new_val}\n```")
        else:
            parts.append(f"{key}: {old_val!r} -> {new_val!r}")
    return "\n".join(parts)
132+
133+
134+
class Comparator:
    """Cross-reference two primer JSON outputs and iterate over differences.

    Yields ``(package, missing, new, changed)`` for each package that has at
    least one difference. *changed* contains pairs of ``(old, new)`` messages
    that are the same diagnostic but with altered details (line, message text,
    etc.).
    """

    def __init__(
        self, main_data: PackageMessages, pr_data: PackageMessages
    ) -> None:
        # Per-package diff results, keyed by package name.
        self.missing_messages: dict[str, list[OldJsonExport]] = {}
        self.new_messages: dict[str, list[OldJsonExport]] = {}
        self.changed_messages: dict[str, list[ChangedMessage]] = {}
        self.commits: dict[str, str] = {}

        for package, data in main_data.items():
            self.commits[package] = pr_data[package]["commit"]
            # Work on a copy: the previous implementation called
            # ``pr_data[package]["messages"].remove(...)`` in place,
            # destructively consuming the caller's data structure.
            remaining_new = list(pr_data[package]["messages"])
            # First pass: exact-match removal.
            residual_old: list[OldJsonExport] = []
            for message in data["messages"]:
                try:
                    remaining_new.remove(message)
                except ValueError:
                    residual_old.append(message)

            # Second pass: fuzzy-match residuals to detect *changed* messages.
            paired, final_missing, final_new = _fuzzy_match_residuals(
                residual_old, remaining_new
            )

            self.missing_messages[package] = final_missing
            self.new_messages[package] = final_new
            self.changed_messages[package] = paired

    def __iter__(
        self,
    ) -> Iterator[
        tuple[
            str,
            list[OldJsonExport],
            list[OldJsonExport],
            list[ChangedMessage],
        ]
    ]:
        """Yield (package, missing, new, changed), skipping unchanged packages."""
        for package in self.missing_messages:
            missing = self.missing_messages[package]
            new = self.new_messages[package]
            changed = self.changed_messages[package]
            if not missing and not new and not changed:
                continue
            yield package, missing, new, changed

    @staticmethod
    def from_json(
        base_file: str, new_file: str, batches: int | None = None
    ) -> Comparator:
        """Build a Comparator from JSON file paths, handling batched runs.

        When *batches* is given, the literal ``BATCHIDX`` in both file names
        is replaced with ``batch0`` .. ``batch{batches-1}`` and the per-batch
        results are merged.
        """
        if batches is None:
            return Comparator(_load_json(base_file), _load_json(new_file))
        # Annotate once up front: the previous version re-declared these
        # names with annotations inside the ``else`` branch, which type
        # checkers flag as a redefinition.
        main_data: PackageMessages = {}
        pr_data: PackageMessages = {}
        for idx in range(batches):
            main_data.update(
                _load_json(base_file.replace("BATCHIDX", f"batch{idx}"))
            )
            pr_data.update(
                _load_json(new_file.replace("BATCHIDX", f"batch{idx}"))
            )
        return Comparator(main_data, pr_data)
207+
208+
209+
def _load_json(file_path: Path | str) -> PackageMessages:
210+
with open(file_path, encoding="utf-8") as f:
211+
result: PackageMessages = json.load(f)
212+
return result

0 commit comments

Comments
 (0)