77import json
88from collections import defaultdict
99from collections .abc import Iterator
10- from difflib import SequenceMatcher
1110from pathlib import Path
1211
1312from pylint .reporters .json_reporter import OldJsonExport
1413from pylint .testutils ._primer .primer_command import PackageMessages
1514
16- # Minimum SequenceMatcher ratio to consider two residual messages "the same
17- # diagnostic". The identity fields (symbol, path, obj) already match, so a
18- # generous threshold is fine here.
19- _FUZZY_THRESHOLD = 0.5
20-
# A "changed" diagnostic is reported as the (old, new) pair of messages.
ChangedMessage = tuple[OldJsonExport, OldJsonExport]  # (old, new)

# Keys of an OldJsonExport that describe where a message points in the file.
# NOTE(review): not used in the visible code — presumably consulted when
# deciding whether two messages differ only by location; confirm against the
# comparison code further down the file.
_LOCATION_KEYS = {"line", "column", "endLine", "endColumn"}
2519
26- def _match_key (msg : OldJsonExport ) -> tuple [str , str , str ]:
27- return (msg ["symbol" ], msg ["path" ], msg ["obj" ])
28-
29-
30- def _fuzzy_pair (
31- old_msgs : list [OldJsonExport ], new_msgs : list [OldJsonExport ]
32- ) -> tuple [list [ChangedMessage ], list [OldJsonExport ], list [OldJsonExport ]]:
33- """Pair residual messages by similarity.
20+ def _position_key (msg : OldJsonExport ) -> tuple [str , str , str ]:
21+ """Key that identifies a diagnostic independently of its text or location.
3422
35- Returns (paired, unmatched_old, unmatched_new).
23+ Two messages that share (symbol, path, obj) are the "same diagnostic" — if
24+ they differ only in line numbers or message text, they should be reported as
25+ *changed* rather than as a separate removal + addition.
3626 """
37- if not old_msgs or not new_msgs :
38- return [], old_msgs , new_msgs
39-
40- paired : list [ChangedMessage ] = []
41- used_old : set [int ] = set ()
42- used_new : set [int ] = set ()
43-
44- for i , old in enumerate (old_msgs ):
45- old_str = str (old )
46- best_ratio = _FUZZY_THRESHOLD
47- best_idx = - 1
48- for j , new in enumerate (new_msgs ):
49- if j in used_new :
50- continue
51- ratio = SequenceMatcher (None , old_str , str (new )).ratio ()
52- if ratio > best_ratio :
53- best_ratio = ratio
54- best_idx = j
55- if best_idx >= 0 :
56- paired .append ((old , new_msgs [best_idx ]))
57- used_old .add (i )
58- used_new .add (best_idx )
59-
60- unmatched_old = [m for i , m in enumerate (old_msgs ) if i not in used_old ]
61- unmatched_new = [m for j , m in enumerate (new_msgs ) if j not in used_new ]
62- return paired , unmatched_old , unmatched_new
27+ return (msg ["symbol" ], msg ["path" ], msg ["obj" ])
6328
6429
65- def _fuzzy_match_residuals (
30+ def _match_residuals (
6631 old_residuals : list [OldJsonExport ], new_residuals : list [OldJsonExport ]
6732) -> tuple [list [ChangedMessage ], list [OldJsonExport ], list [OldJsonExport ]]:
68- """Fuzzy-match residual messages by identity fields then similarity.
33+ """Pair residual messages by position key ``(symbol, path, obj)``.
34+
35+ Messages that share the same key are paired 1:1 in order. Any left-over
36+ messages remain as genuinely missing or genuinely new.
6937
70- Returns (paired, unmatched_old, unmatched_new) with original order preserved .
38+ Returns `` (paired, unmatched_old, unmatched_new)`` .
7139 """
7240 old_by_key : dict [tuple [str , str , str ], list [OldJsonExport ]] = defaultdict (list )
7341 new_by_key : dict [tuple [str , str , str ], list [OldJsonExport ]] = defaultdict (list )
7442 for m in old_residuals :
75- old_by_key [_match_key (m )].append (m )
43+ old_by_key [_position_key (m )].append (m )
7644 for m in new_residuals :
77- new_by_key [_match_key (m )].append (m )
45+ new_by_key [_position_key (m )].append (m )
7846
7947 paired : list [ChangedMessage ] = []
8048 paired_old_ids : set [int ] = set ()
8149 paired_new_ids : set [int ] = set ()
8250 for key in old_by_key :
8351 if key not in new_by_key :
8452 continue
85- p , _ , _ = _fuzzy_pair (old_by_key [key ], new_by_key [key ])
86- for old , new in p :
53+ for old , new in zip (old_by_key [key ], new_by_key [key ]):
8754 paired .append ((old , new ))
8855 paired_old_ids .add (id (old ))
8956 paired_new_ids .add (id (new ))
@@ -140,26 +107,25 @@ class Comparator:
140107 etc.).
141108 """
142109
143- def __init__ (
144- self , main_data : PackageMessages , pr_data : PackageMessages
145- ) -> None :
110+ def __init__ (self , main_data : PackageMessages , pr_data : PackageMessages ) -> None :
146111 self .missing_messages : dict [str , list [OldJsonExport ]] = {}
147112 self .new_messages : dict [str , list [OldJsonExport ]] = {}
148113 self .changed_messages : dict [str , list [ChangedMessage ]] = {}
149114 self .commits : dict [str , str ] = {}
150115
151116 for package , data in main_data .items ():
152117 self .commits [package ] = pr_data [package ]["commit" ]
153- # First pass: exact-match removal (same as before) .
118+ # First pass: exact-match removal.
154119 residual_old : list [OldJsonExport ] = []
155120 for message in data ["messages" ]:
156121 try :
157122 pr_data [package ]["messages" ].remove (message )
158123 except ValueError :
159124 residual_old .append (message )
160125
161- # Second pass: fuzzy-match residuals to detect *changed* messages.
162- paired , final_missing , final_new = _fuzzy_match_residuals (
126+ # Second pass: pair residuals by position to detect *changed*
127+ # messages (same diagnostic, different line or text).
128+ paired , final_missing , final_new = _match_residuals (
163129 residual_old , pr_data [package ]["messages" ]
164130 )
165131
@@ -177,8 +143,7 @@ def __iter__(
177143 list [ChangedMessage ],
178144 ]
179145 ]:
180- for package in self .missing_messages :
181- missing = self .missing_messages [package ]
146+ for package , missing in self .missing_messages .items ():
182147 new = self .new_messages [package ]
183148 changed = self .changed_messages [package ]
184149 if not missing and not new and not changed :
@@ -190,12 +155,14 @@ def from_json(
190155 base_file : str , new_file : str , batches : int | None = None
191156 ) -> Comparator :
192157 """Build a Comparator from JSON file paths, handling batched runs."""
158+ main_data : PackageMessages
159+ pr_data : PackageMessages
193160 if batches is None :
194161 main_data = _load_json (base_file )
195162 pr_data = _load_json (new_file )
196163 else :
197- main_data : PackageMessages = {}
198- pr_data : PackageMessages = {}
164+ main_data = {}
165+ pr_data = {}
199166 for idx in range (batches ):
200167 main_data .update (
201168 _load_json (base_file .replace ("BATCHIDX" , "batch" + str (idx )))
0 commit comments