|
| 1 | +# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html |
| 2 | +# For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE |
| 3 | +# Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt |
| 4 | + |
| 5 | +from __future__ import annotations |
| 6 | + |
| 7 | +import json |
| 8 | +from collections import defaultdict |
| 9 | +from collections.abc import Iterator |
| 10 | +from difflib import SequenceMatcher |
| 11 | +from pathlib import Path |
| 12 | + |
| 13 | +from pylint.reporters.json_reporter import OldJsonExport |
| 14 | +from pylint.testutils._primer.primer_command import PackageMessages |
| 15 | + |
# Minimum SequenceMatcher ratio to consider two residual messages "the same
# diagnostic". The identity fields (symbol, path, obj) already match, so a
# generous threshold is fine here.
_FUZZY_THRESHOLD = 0.5

# A pair of messages that represent the same diagnostic on both runs but with
# differing details, ordered as (old, new).
ChangedMessage = tuple[OldJsonExport, OldJsonExport]  # (old, new)

# Message fields that together describe *where* a diagnostic was raised;
# changes to any of them are merged into a single human-readable span.
_LOCATION_KEYS = {"line", "column", "endLine", "endColumn"}
| 24 | + |
| 25 | + |
| 26 | +def _match_key(msg: OldJsonExport) -> tuple[str, str, str]: |
| 27 | + return (msg["symbol"], msg["path"], msg["obj"]) |
| 28 | + |
| 29 | + |
def _fuzzy_pair(
    old_msgs: list[OldJsonExport], new_msgs: list[OldJsonExport]
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
    """Greedily pair residual messages by textual similarity.

    Each old message (in order) is paired with the not-yet-used new message
    whose ``SequenceMatcher`` ratio is strictly above ``_FUZZY_THRESHOLD``
    and highest among the remaining candidates (first best wins on ties).

    Returns (paired, unmatched_old, unmatched_new).
    """
    if not old_msgs or not new_msgs:
        return [], old_msgs, new_msgs

    # Hoist the dict -> str rendering out of the O(len(old) * len(new))
    # comparison loop; previously str(new) was recomputed for every old
    # message. Ratios (and therefore pairings) are unchanged.
    new_strs = [str(new) for new in new_msgs]

    paired: list[ChangedMessage] = []
    used_old: set[int] = set()
    used_new: set[int] = set()

    for i, old in enumerate(old_msgs):
        old_str = str(old)
        # Seeding best_ratio with the threshold makes the strict ">" test
        # below double as the acceptance criterion.
        best_ratio = _FUZZY_THRESHOLD
        best_idx = -1
        for j, new_str in enumerate(new_strs):
            if j in used_new:
                continue
            ratio = SequenceMatcher(None, old_str, new_str).ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best_idx = j
        if best_idx >= 0:
            paired.append((old, new_msgs[best_idx]))
            used_old.add(i)
            used_new.add(best_idx)

    unmatched_old = [m for i, m in enumerate(old_msgs) if i not in used_old]
    unmatched_new = [m for j, m in enumerate(new_msgs) if j not in used_new]
    return paired, unmatched_old, unmatched_new
| 63 | + |
| 64 | + |
def _fuzzy_match_residuals(
    old_residuals: list[OldJsonExport], new_residuals: list[OldJsonExport]
) -> tuple[list[ChangedMessage], list[OldJsonExport], list[OldJsonExport]]:
    """Fuzzy-match residual messages by identity fields then similarity.

    Returns (paired, unmatched_old, unmatched_new) with original order preserved.
    """
    # Bucket both sides by (symbol, path, obj) so only messages sharing their
    # identity fields are ever compared for textual similarity.
    old_buckets: dict[tuple[str, str, str], list[OldJsonExport]] = {}
    new_buckets: dict[tuple[str, str, str], list[OldJsonExport]] = {}
    for msg in old_residuals:
        old_buckets.setdefault(_match_key(msg), []).append(msg)
    for msg in new_residuals:
        new_buckets.setdefault(_match_key(msg), []).append(msg)

    paired: list[ChangedMessage] = []
    matched_old: set[int] = set()
    matched_new: set[int] = set()
    for key, bucket in old_buckets.items():
        counterpart = new_buckets.get(key)
        if counterpart is None:
            continue
        for old_msg, new_msg in _fuzzy_pair(bucket, counterpart)[0]:
            paired.append((old_msg, new_msg))
            # Track pairings by object identity so equal-but-distinct
            # messages are not dropped from the leftovers.
            matched_old.add(id(old_msg))
            matched_new.add(id(new_msg))

    # Rebuild the leftovers from the original lists so ordering is stable.
    still_missing = [m for m in old_residuals if id(m) not in matched_old]
    still_new = [m for m in new_residuals if id(m) not in matched_new]
    return paired, still_missing, still_new
| 94 | + |
| 95 | + |
def format_span(msg: OldJsonExport) -> str:
    """Format a message's location as ``line:col to endLine:endCol``."""
    span = f"{msg['line']}:{msg['column']}"
    end = (msg.get("endLine"), msg.get("endColumn"))
    if None in end:
        # End position is only rendered when both parts are present.
        return span
    return f"{span} to {end[0]}:{end[1]}"
| 104 | + |
| 105 | + |
def message_diff(old: OldJsonExport, new: OldJsonExport) -> str:
    """Return a compact summary of changed fields between two messages.

    Location changes are merged into a single human-readable span.
    String fields (like ``message``) get a ``diff`` code block so GitHub
    renders them with red/green highlighting.
    """
    changed_keys = {
        key
        for key in old
        if old[key] != new[key]  # type: ignore[literal-required]
    }

    parts: list[str] = []
    # Any line/column/endLine/endColumn change collapses to one sentence.
    if changed_keys & _LOCATION_KEYS:
        parts.append(f"Was raised on {format_span(old)}, now on {format_span(new)}.")

    # Remaining fields (typically ``message``), in deterministic order.
    for key in sorted(changed_keys - _LOCATION_KEYS):
        before = old[key]  # type: ignore[literal-required]
        after = new[key]  # type: ignore[literal-required]
        if isinstance(before, str) and isinstance(after, str):
            parts.append(f"The {key} changed:\n```diff\n- {before}\n+ {after}\n```")
        else:
            parts.append(f"{key}: {before!r} -> {after!r}")
    return "\n".join(parts)
| 132 | + |
| 133 | + |
class Comparator:
    """Cross-reference two primer JSON outputs and iterate over differences.

    Yields ``(package, missing, new, changed)`` for each package that has at
    least one difference. *changed* contains pairs of ``(old, new)`` messages
    that are the same diagnostic but with altered details (line, message text,
    etc.).
    """

    def __init__(
        self, main_data: PackageMessages, pr_data: PackageMessages
    ) -> None:
        """Compute missing/new/changed message sets for every package.

        NOTE(review): ``pr_data`` is mutated in place — exactly-matching
        messages are removed from its per-package lists below, so callers
        should not reuse it afterwards.
        """
        # Messages present on main but absent from the PR run.
        self.missing_messages: dict[str, list[OldJsonExport]] = {}
        # Messages introduced by the PR run.
        self.new_messages: dict[str, list[OldJsonExport]] = {}
        # Same diagnostic on both sides but with differing details.
        self.changed_messages: dict[str, list[ChangedMessage]] = {}
        # Package name -> commit hash recorded by the PR run.
        self.commits: dict[str, str] = {}

        # Only packages present in main_data are compared; a package that
        # appears solely in pr_data is ignored, and one missing from pr_data
        # would raise KeyError on the next line.
        for package, data in main_data.items():
            self.commits[package] = pr_data[package]["commit"]
            # First pass: exact-match removal (same as before).
            residual_old: list[OldJsonExport] = []
            for message in data["messages"]:
                try:
                    # list.remove drops the first equal occurrence, so
                    # duplicated messages are cancelled one-for-one.
                    pr_data[package]["messages"].remove(message)
                except ValueError:
                    residual_old.append(message)

            # Second pass: fuzzy-match residuals to detect *changed* messages.
            paired, final_missing, final_new = _fuzzy_match_residuals(
                residual_old, pr_data[package]["messages"]
            )

            self.missing_messages[package] = final_missing
            self.new_messages[package] = final_new
            self.changed_messages[package] = paired

    def __iter__(
        self,
    ) -> Iterator[
        tuple[
            str,
            list[OldJsonExport],
            list[OldJsonExport],
            list[ChangedMessage],
        ]
    ]:
        """Yield ``(package, missing, new, changed)`` for packages that differ."""
        for package in self.missing_messages:
            missing = self.missing_messages[package]
            new = self.new_messages[package]
            changed = self.changed_messages[package]
            # Packages with no differences at all are skipped entirely.
            if not missing and not new and not changed:
                continue
            yield package, missing, new, changed

    @staticmethod
    def from_json(
        base_file: str, new_file: str, batches: int | None = None
    ) -> Comparator:
        """Build a Comparator from JSON file paths, handling batched runs."""
        if batches is None:
            main_data = _load_json(base_file)
            pr_data = _load_json(new_file)
        else:
            # When batched, the file names contain a literal "BATCHIDX"
            # placeholder that is substituted per batch below.
            # NOTE(review): the annotated re-assignment of main_data/pr_data
            # here is fine at runtime but flagged by some type checkers.
            main_data: PackageMessages = {}
            pr_data: PackageMessages = {}
            for idx in range(batches):
                main_data.update(
                    _load_json(base_file.replace("BATCHIDX", "batch" + str(idx)))
                )
                pr_data.update(
                    _load_json(new_file.replace("BATCHIDX", "batch" + str(idx)))
                )
        return Comparator(main_data, pr_data)
| 207 | + |
| 208 | + |
| 209 | +def _load_json(file_path: Path | str) -> PackageMessages: |
| 210 | + with open(file_path, encoding="utf-8") as f: |
| 211 | + result: PackageMessages = json.load(f) |
| 212 | + return result |
0 commit comments