
Commit 9b40232

Introduce LineSetHashResult and make hash params required in _find_common
Replace the raw tuple[HashToIndex_T, IndexToLines_T] with a LineSetHashResult NamedTuple for clarity. Since _iter_sims always passes pre-computed hashes from its cache, make the parameters required and remove the unused fallback branches.
1 parent 3ef0aaa commit 9b40232

1 file changed: 36 additions & 44 deletions

pylint/checkers/symilar.py
@@ -77,6 +77,14 @@ class LineSpecifs(NamedTuple):
 # Links index in the lineset's stripped lines to the real lines in the file
 IndexToLines_T = dict[Index, "SuccessiveLinesLimits"]
 
+
+class LineSetHashResult(NamedTuple):
+    """Pre-computed hash data for a LineSet, used to speed up similarity lookup."""
+
+    hash_to_index: HashToIndex_T
+    index_to_lines: IndexToLines_T
+
+
 # The types the streams read by pylint can take. Originating from astroid.nodes.Module.stream() and open()
 STREAM_TYPES: TypeAlias = TextIO | BufferedReader | BytesIO
 
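
As a quick aside on the commit's rationale: a NamedTuple keeps full tuple compatibility while giving call sites named fields. A minimal standalone sketch — HashResult and its simplified value types are illustrative stand-ins, not the real pylint aliases:

from collections import defaultdict
from typing import NamedTuple


class HashResult(NamedTuple):
    """Hypothetical stand-in for LineSetHashResult."""

    hash_to_index: dict[int, list[int]]
    index_to_lines: dict[int, tuple[int, int]]


result = HashResult(defaultdict(list), {0: (1, 5)})

# Before: positional unpacking obscures what each slot means.
h2i, i2l = result
# After: self-documenting attribute access, still tuple-compatible.
assert result.hash_to_index is h2i and result.index_to_lines is i2l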

@@ -166,25 +174,25 @@ def increment(self, value: Index) -> LineSetStartCouple:
 
 def hash_lineset(
     lineset: LineSet, min_common_lines: int = DEFAULT_MIN_SIMILARITY_LINE
-) -> tuple[HashToIndex_T, IndexToLines_T]:
-    """Return two dicts.
+) -> LineSetHashResult:
+    """Return pre-computed hash data for a lineset.
 
-    The first associates the hash of successive stripped lines of a lineset
-    to the indices of the starting lines.
-    The second dict, associates the index of the starting line in the lineset's stripped lines to the
-    couple [start, end] lines number in the corresponding file.
+    The result contains two dicts:
+    - hash_to_index: maps the hash of successive stripped lines to the starting
+      indices (in lineset's stripped lines) of the chunks that produced that hash.
+    - index_to_lines: maps the index of the starting line in the lineset's stripped
+      lines to the start and end line numbers in the corresponding file.
 
     :param lineset: lineset object (i.e the lines in a file)
     :param min_common_lines: number of successive lines that are used to compute the hash
-    :return: a dict linking hashes to corresponding start index and a dict that links this
-        index to the start and end lines in the file
+    :return: a LineSetHashResult with hash-to-index and index-to-lines mappings
     """
     hash_to_index: HashToIndex_T = defaultdict(list)
     index_to_lines: IndexToLines_T = {}
     stripped = lineset.stripped_lines
     num_lines = len(stripped)
     if num_lines < min_common_lines:
-        return hash_to_index, index_to_lines
+        return LineSetHashResult(hash_to_index, index_to_lines)
 
     # Pre-compute per-line hashes for the rolling window
     line_hashes = [hash(spec.text) for spec in stripped]
@@ -213,7 +221,7 @@ def hash_lineset(
         if window_end <= last_index:
             window_hash = window_hash - line_hashes[i] + line_hashes[window_end]
 
-    return hash_to_index, index_to_lines
+    return LineSetHashResult(hash_to_index, index_to_lines)
 
 
 def remove_successive(all_couples: CplIndexToCplLines_T) -> None:
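
The "rolling window" comment refers to the O(1) slide visible in this hunk: window_hash - line_hashes[i] + line_hashes[window_end]. A minimal sketch of the technique under illustrative names, using a plain sum of per-line hashes as the combiner (a sum is order-insensitive, so identical multisets of lines collide and a real implementation has to tolerate such collisions):

from collections import defaultdict


def rolling_chunk_hashes(lines: list[str], window: int) -> dict[int, list[int]]:
    """Map each window hash to the start indices of the chunks producing it."""
    hash_to_index: dict[int, list[int]] = defaultdict(list)
    if len(lines) < window:
        return hash_to_index
    line_hashes = [hash(line) for line in lines]
    window_hash = sum(line_hashes[:window])
    last_start = len(lines) - window
    for start in range(last_start + 1):
        hash_to_index[window_hash].append(start)
        if start < last_start:
            # Slide by one line: drop the outgoing hash, add the incoming one.
            window_hash += line_hashes[start + window] - line_hashes[start]
    return hash_to_index


# The 2-line chunk "a", "b" occurs at indices 0 and 3, so its hash maps to both.
index = rolling_chunk_hashes(["a", "b", "c", "a", "b"], window=2)
assert [starts for starts in index.values() if len(starts) > 1] == [[0, 3]]
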
@@ -434,13 +442,12 @@ def _get_similarity_report(
         )
         return report
 
-    # pylint: disable = too-many-locals
     def _find_common(
         self,
         lineset1: LineSet,
         lineset2: LineSet,
-        hashes1: tuple[HashToIndex_T, IndexToLines_T] | None = None,
-        hashes2: tuple[HashToIndex_T, IndexToLines_T] | None = None,
+        hashes1: LineSetHashResult,
+        hashes2: LineSetHashResult,
     ) -> Generator[Commonality]:
         """Find similarities in the two given linesets.
 
@@ -454,39 +461,24 @@ def _find_common(
         account common chunk of lines that have more than the minimal number of
         successive lines required.
         """
-        hash_to_index_1: HashToIndex_T
-        hash_to_index_2: HashToIndex_T
-        index_to_lines_1: IndexToLines_T
-        index_to_lines_2: IndexToLines_T
-        if hashes1 is not None:
-            hash_to_index_1, index_to_lines_1 = hashes1
-        else:
-            hash_to_index_1, index_to_lines_1 = hash_lineset(
-                lineset1, self.namespace.min_similarity_lines
-            )
-        if hashes2 is not None:
-            hash_to_index_2, index_to_lines_2 = hashes2
-        else:
-            hash_to_index_2, index_to_lines_2 = hash_lineset(
-                lineset2, self.namespace.min_similarity_lines
-            )
-
-        common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys()
+        common_hashes = hashes1.hash_to_index.keys() & hashes2.hash_to_index.keys()
 
         # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of
         # successive common lines, to the corresponding starting and ending number lines in both files
         all_couples: CplIndexToCplLines_T = {}
 
-        for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0]):
+        for chunk_hash in sorted(
+            common_hashes, key=lambda h: hashes1.hash_to_index[h][0]
+        ):
             for indices_in_linesets in itertools.product(
-                hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash]
+                hashes1.hash_to_index[chunk_hash], hashes2.hash_to_index[chunk_hash]
             ):
                 index_1 = indices_in_linesets[0]
                 index_2 = indices_in_linesets[1]
                 all_couples[LineSetStartCouple(index_1, index_2)] = (
                     CplSuccessiveLinesLimits(
-                        copy.copy(index_to_lines_1[index_1]),
-                        copy.copy(index_to_lines_2[index_2]),
+                        copy.copy(hashes1.index_to_lines[index_1]),
+                        copy.copy(hashes2.index_to_lines[index_2]),
                         effective_cmn_lines_nb=self.namespace.min_similarity_lines,
                     )
                 )
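
With both sides pre-hashed, the body of _find_common now reduces to a set intersection over the hash keys followed by a cartesian product of the start indices, as in this toy illustration (the hash values and indices are made up):

import itertools

hash_to_index_1 = {101: [0, 7], 202: [3]}  # lineset 1: chunk hash -> start indices
hash_to_index_2 = {101: [2], 303: [5]}     # lineset 2

# Only chunks whose hash occurs in both linesets can be similar.
common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys()  # {101}

# Pair every occurrence on one side with every occurrence on the other.
couples = [
    pair
    for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0])
    for pair in itertools.product(
        hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash]
    )
]
assert couples == [(0, 2), (7, 2)]
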
@@ -522,17 +514,17 @@ def _iter_sims(self) -> Generator[Commonality]:
         min_lines = self.namespace.min_similarity_lines
         # Cache hash_lineset results: each lineset is compared against every
         # other, so without caching it gets hashed (N-1) times.
-        cache: dict[int, tuple[HashToIndex_T, IndexToLines_T]] = {}
-        for idx, lineset in enumerate(self.linesets[:-1]):
+        cache: dict[int, LineSetHashResult] = {}
+        for idx, lineset1 in enumerate(self.linesets[:-1]):
             for lineset2 in self.linesets[idx + 1 :]:
-                lid1 = id(lineset)
-                if lid1 not in cache:
-                    cache[lid1] = hash_lineset(lineset, min_lines)
-                lid2 = id(lineset2)
-                if lid2 not in cache:
-                    cache[lid2] = hash_lineset(lineset2, min_lines)
+                key1 = id(lineset1)
+                if key1 not in cache:
+                    cache[key1] = hash_lineset(lineset1, min_lines)
+                key2 = id(lineset2)
+                if key2 not in cache:
+                    cache[key2] = hash_lineset(lineset2, min_lines)
                 yield from self._find_common(
-                    lineset, lineset2, cache[lid1], cache[lid2]
+                    lineset1, lineset2, cache[key1], cache[key2]
                 )
 
     def get_map_data(self) -> list[LineSet]:
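
A note on the caching pattern: keying by id() is safe here only because every lineset stays alive in self.linesets for the whole iteration, so ids cannot be recycled. The same pattern in isolation (expensive_prep and items are placeholders, not pylint APIs):

import itertools


def expensive_prep(text: str) -> str:
    """Placeholder for hash_lineset: pretend this is costly."""
    return text.upper()


items = ["ab", "cd", "ef"]
cache: dict[int, str] = {}
for first, second in itertools.combinations(items, 2):
    # Prepare each object at most once, even though it appears in many pairs.
    for obj in (first, second):
        if id(obj) not in cache:
            cache[id(obj)] = expensive_prep(obj)
    # ... compare cache[id(first)] against cache[id(second)] ...

assert len(cache) == 3  # each item prepared once, not once per pair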
