Skip to content

Commit d1083e2

Browse files
Replace fuzzy matching with position-based pairing in primer diff
Match residual messages by (symbol, path, obj) key and pair them 1:1 instead of using SequenceMatcher similarity scoring. This catches the same "message was slightly changed" cases (line moved, text changed) with simpler, more predictable logic and no difflib dependency. Also clean up <details> block formatting (no extra blank lines).
1 parent 53f2359 commit d1083e2

9 files changed

Lines changed: 39 additions & 78 deletions

File tree

pylint/testutils/_primer/comparator.py

Lines changed: 25 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -7,83 +7,50 @@
77
import json
88
from collections import defaultdict
99
from collections.abc import Iterator
10-
from difflib import SequenceMatcher
1110
from pathlib import Path
1211

1312
from pylint.reporters.json_reporter import OldJsonExport
1413
from pylint.testutils._primer.primer_command import PackageMessages
1514

16-
# Minimum SequenceMatcher ratio to consider two residual messages "the same
17-
# diagnostic". The identity fields (symbol, path, obj) already match, so a
18-
# generous threshold is fine here.
19-
_FUZZY_THRESHOLD = 0.5
20-
2115
ChangedMessage = tuple[OldJsonExport, OldJsonExport] # (old, new)
2216

2317
_LOCATION_KEYS = {"line", "column", "endLine", "endColumn"}
2418

2519

26-
def _match_key(msg: OldJsonExport) -> tuple[str, str, str]:
27-
return (msg["symbol"], msg["path"], msg["obj"])
28-
29-
30-
def _fuzzy_pair(
31-
old_msgs: list[OldJsonExport], new_msgs: list[OldJsonExport]
32-
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
33-
"""Pair residual messages by similarity.
20+
def _position_key(msg: OldJsonExport) -> tuple[str, str, str]:
21+
"""Key that identifies a diagnostic independently of its text or location.
3422
35-
Returns (paired, unmatched_old, unmatched_new).
23+
Two messages that share (symbol, path, obj) are the "same diagnostic" — if
24+
they differ only in line numbers or message text, they should be reported as
25+
*changed* rather than as a separate removal + addition.
3626
"""
37-
if not old_msgs or not new_msgs:
38-
return [], old_msgs, new_msgs
39-
40-
paired: list[ChangedMessage] = []
41-
used_old: set[int] = set()
42-
used_new: set[int] = set()
43-
44-
for i, old in enumerate(old_msgs):
45-
old_str = str(old)
46-
best_ratio = _FUZZY_THRESHOLD
47-
best_idx = -1
48-
for j, new in enumerate(new_msgs):
49-
if j in used_new:
50-
continue
51-
ratio = SequenceMatcher(None, old_str, str(new)).ratio()
52-
if ratio > best_ratio:
53-
best_ratio = ratio
54-
best_idx = j
55-
if best_idx >= 0:
56-
paired.append((old, new_msgs[best_idx]))
57-
used_old.add(i)
58-
used_new.add(best_idx)
59-
60-
unmatched_old = [m for i, m in enumerate(old_msgs) if i not in used_old]
61-
unmatched_new = [m for j, m in enumerate(new_msgs) if j not in used_new]
62-
return paired, unmatched_old, unmatched_new
27+
return (msg["symbol"], msg["path"], msg["obj"])
6328

6429

65-
def _fuzzy_match_residuals(
30+
def _match_residuals(
6631
old_residuals: list[OldJsonExport], new_residuals: list[OldJsonExport]
6732
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
68-
"""Fuzzy-match residual messages by identity fields then similarity.
33+
"""Pair residual messages by position key ``(symbol, path, obj)``.
34+
35+
Messages that share the same key are paired 1:1 in order. Any left-over
36+
messages remain as genuinely missing or genuinely new.
6937
70-
Returns (paired, unmatched_old, unmatched_new) with original order preserved.
38+
Returns ``(paired, unmatched_old, unmatched_new)``.
7139
"""
7240
old_by_key: dict[tuple[str, str, str], list[OldJsonExport]] = defaultdict(list)
7341
new_by_key: dict[tuple[str, str, str], list[OldJsonExport]] = defaultdict(list)
7442
for m in old_residuals:
75-
old_by_key[_match_key(m)].append(m)
43+
old_by_key[_position_key(m)].append(m)
7644
for m in new_residuals:
77-
new_by_key[_match_key(m)].append(m)
45+
new_by_key[_position_key(m)].append(m)
7846

7947
paired: list[ChangedMessage] = []
8048
paired_old_ids: set[int] = set()
8149
paired_new_ids: set[int] = set()
8250
for key in old_by_key:
8351
if key not in new_by_key:
8452
continue
85-
p, _, _ = _fuzzy_pair(old_by_key[key], new_by_key[key])
86-
for old, new in p:
53+
for old, new in zip(old_by_key[key], new_by_key[key]):
8754
paired.append((old, new))
8855
paired_old_ids.add(id(old))
8956
paired_new_ids.add(id(new))
@@ -140,26 +107,25 @@ class Comparator:
140107
etc.).
141108
"""
142109

143-
def __init__(
144-
self, main_data: PackageMessages, pr_data: PackageMessages
145-
) -> None:
110+
def __init__(self, main_data: PackageMessages, pr_data: PackageMessages) -> None:
146111
self.missing_messages: dict[str, list[OldJsonExport]] = {}
147112
self.new_messages: dict[str, list[OldJsonExport]] = {}
148113
self.changed_messages: dict[str, list[ChangedMessage]] = {}
149114
self.commits: dict[str, str] = {}
150115

151116
for package, data in main_data.items():
152117
self.commits[package] = pr_data[package]["commit"]
153-
# First pass: exact-match removal (same as before).
118+
# First pass: exact-match removal.
154119
residual_old: list[OldJsonExport] = []
155120
for message in data["messages"]:
156121
try:
157122
pr_data[package]["messages"].remove(message)
158123
except ValueError:
159124
residual_old.append(message)
160125

161-
# Second pass: fuzzy-match residuals to detect *changed* messages.
162-
paired, final_missing, final_new = _fuzzy_match_residuals(
126+
# Second pass: pair residuals by position to detect *changed*
127+
# messages (same diagnostic, different line or text).
128+
paired, final_missing, final_new = _match_residuals(
163129
residual_old, pr_data[package]["messages"]
164130
)
165131

@@ -177,8 +143,7 @@ def __iter__(
177143
list[ChangedMessage],
178144
]
179145
]:
180-
for package in self.missing_messages:
181-
missing = self.missing_messages[package]
146+
for package, missing in self.missing_messages.items():
182147
new = self.new_messages[package]
183148
changed = self.changed_messages[package]
184149
if not missing and not new and not changed:
@@ -190,12 +155,14 @@ def from_json(
190155
base_file: str, new_file: str, batches: int | None = None
191156
) -> Comparator:
192157
"""Build a Comparator from JSON file paths, handling batched runs."""
158+
main_data: PackageMessages
159+
pr_data: PackageMessages
193160
if batches is None:
194161
main_data = _load_json(base_file)
195162
pr_data = _load_json(new_file)
196163
else:
197-
main_data: PackageMessages = {}
198-
pr_data: PackageMessages = {}
164+
main_data = {}
165+
pr_data = {}
199166
for idx in range(batches):
200167
main_data.update(
201168
_load_json(base_file.replace("BATCHIDX", "batch" + str(idx)))

pylint/testutils/_primer/primer_compare_command.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,18 +66,18 @@ def _source_link(msg: OldJsonExport) -> str:
6666
filepath = str(PurePosixPath(msg["path"]).relative_to(clone_dir))
6767
return f"{url}/blob/{commit}/{filepath}#L{msg['line']}"
6868

69-
comment = f"\n**Effect on [{package}]({url}):**\n"
69+
comment = f"\n**Effect on [{package}]({url}):**\n\n"
7070

7171
# -- Changed messages (same diagnostic, different details) -----------
7272
if changed_messages:
73-
comment += "The following messages have been changed:\n\n<details>\n\n"
73+
comment += "The following messages have been changed:\n\n<details>\n"
7474
for count, (old, new) in enumerate(changed_messages, 1):
7575
comment += (
7676
f"{count}) {new['symbol']}:\n"
7777
f"{message_diff(old, new)}\n"
7878
f"{_source_link(new)}\n"
7979
)
80-
comment += "\n</details>\n\n"
80+
comment += "</details>\n\n"
8181

8282
# -- New messages ----------------------------------------------------
8383
astroid_errors = 0
@@ -102,20 +102,20 @@ def _source_link(msg: OldJsonExport) -> str:
102102
)
103103
if new_non_astroid_messages:
104104
comment += (
105-
"The following messages are now emitted:\n\n<details>\n\n"
105+
"The following messages are now emitted:\n\n<details>\n"
106106
+ new_non_astroid_messages
107-
+ "\n</details>\n\n"
107+
+ "</details>\n\n"
108108
)
109109

110110
# -- Missing messages ------------------------------------------------
111111
if missing_messages:
112-
comment += "The following messages are no longer emitted:\n\n<details>\n\n"
112+
comment += "The following messages are no longer emitted:\n\n<details>\n"
113113
for count, message in enumerate(missing_messages, 1):
114114
comment += (
115115
f"{count}) {message['symbol']}:\n*{message['message']}*\n"
116116
f"{_source_link(message)}\n"
117117
)
118-
comment += "\n</details>\n\n"
118+
comment += "</details>\n\n"
119119

120120
return comment
121121

tests/testutils/_primer/fixtures/batched/expected.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,16 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages are no longer emitted:
67

78
<details>
8-
99
1) locally-disabled:
1010
*Locally disabling redefined-builtin (W0622)*
1111
https://github.com/pylint-dev/astroid/blob/1234567890abcdef/astroid/__init__.py#L91
1212
2) unknown-option-value:
1313
*Unknown option value for 'disable', expected a valid pylint message and got 'Ellipsis'*
1414
https://github.com/pylint-dev/astroid/blob/1234567890abcdef/astroid/__init__.py#L91
15-
1615
</details>
1716

1817
*This comment was generated for commit v2.14.2*

tests/testutils/_primer/fixtures/line_moved/expected.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages have been changed:
67

78
<details>
8-
99
1) too-complex:
1010
Was raised on 100:0 to 100:15, now on 103:0 to 103:15.
1111
https://github.com/pylint-dev/astroid/blob/123456789abcdef/astroid/node_classes.py#L103
12-
1312
</details>
1413

1514
*This comment was generated for commit v2.14.2*

tests/testutils/_primer/fixtures/message_changed/expected.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,17 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages have been changed:
67

78
<details>
8-
99
1) locally-disabled:
1010
The message changed:
1111
```diff
1212
- Locally disabling redefined-builtin (W0622)
1313
+ Locally disabling redefined-builtin [we added some text in the message] (W0622)
1414
```
1515
https://github.com/pylint-dev/astroid/blob/123456789abcdef/astroid/__init__.py#L91
16-
1716
</details>
1817

1918
*This comment was generated for commit v2.14.2*

tests/testutils/_primer/fixtures/message_changed/expected_truncated.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages have been changed:
67

78
<details>
8-
99
1) locally-disabled:
1010
The message changed:
1111
```diff

tests/testutils/_primer/fixtures/message_changed_line/expected.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages have been changed:
67

78
<details>
8-
99
1) too-complex:
1010
Was raised on 100:0 to 100:15, now on 103:0 to 103:15.
1111
The message changed:
@@ -14,7 +14,6 @@ The message changed:
1414
+ 'my_function' is too complex. The McCabe rating is 17
1515
```
1616
https://github.com/pylint-dev/astroid/blob/123456789abcdef/astroid/node_classes.py#L103
17-
1817
</details>
1918

2019
*This comment was generated for commit v2.14.2*

tests/testutils/_primer/fixtures/new_message/expected.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages are now emitted:
67

78
<details>
8-
99
1) locally-disabled:
1010
*Locally disabling redefined-builtin [we added some text in the message] (W0622)*
1111
https://github.com/pylint-dev/astroid/blob/123456789abcdef/astroid/__init__.py#L91
12-
1312
</details>
1413

1514
*This comment was generated for commit v2.14.2*

tests/testutils/_primer/fixtures/removed_message/expected.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22

33

44
**Effect on [astroid](https://github.com/pylint-dev/astroid):**
5+
56
The following messages are no longer emitted:
67

78
<details>
8-
99
1) unknown-option-value:
1010
*Unknown option value for 'disable', expected a valid pylint message and got 'Ellipsis'*
1111
https://github.com/pylint-dev/astroid/blob/123456789abcdef/astroid/__init__.py#L91
12-
1312
</details>
1413

1514
*This comment was generated for commit v2.14.2*

0 commit comments

Comments
 (0)