Skip to content

Commit 3ef0aaa

Browse files
Speed up duplicate-code checker with rolling hash and caching
Three optimizations to the symilar checker: - Rolling hash: compute the window hash incrementally (subtract the hash of the leaving line, add that of the entering one) instead of re-summing all k line hashes for every position. - Cache hash_lineset results per lineset in _iter_sims: each file was being hashed once per pair (N-1 times) instead of once total. - Remove the LinesChunk wrapper class and use plain int dict keys, so frozenset intersection and dict lookups use C-level hash/eq. ~25% faster on astroid (17.5s => 12.5s, 25k SLOC) ~70% faster on django (273s => 77s, 130k SLOC)
1 parent b080a21 commit 3ef0aaa

2 files changed

Lines changed: 70 additions & 77 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Speed up the ``duplicate-code`` checker by using C-based hash, a rolling hash window,
2+
and caching results across file pairs. Expect pylint to be ~25% faster on ~25k SLOC
3+
(astroid) and ~70% faster on ~130k SLOC (django) overall when duplicate-code is activated.
4+
5+
Refs #10881

pylint/checkers/symilar.py

Lines changed: 65 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
import copy
3434
import functools
3535
import itertools
36-
import operator
3736
import re
3837
import sys
3938
import warnings
@@ -71,9 +70,9 @@ class LineSpecifs(NamedTuple):
7170
text: str
7271

7372

74-
# Links LinesChunk object to the starting indices (in lineset's stripped lines)
75-
# of the different chunk of lines that are used to compute the hash
76-
HashToIndex_T = dict["LinesChunk", list[Index]]
73+
# Maps the hash of successive stripped lines to the starting indices
74+
# (in lineset's stripped lines) of the chunks that produced that hash.
75+
HashToIndex_T = dict[int, list[Index]]
7776

7877
# Links index in the lineset's stripped lines to the real lines in the file
7978
IndexToLines_T = dict[Index, "SuccessiveLinesLimits"]
@@ -105,45 +104,6 @@ def __init__(
105104
CplIndexToCplLines_T = dict["LineSetStartCouple", CplSuccessiveLinesLimits]
106105

107106

108-
class LinesChunk:
109-
"""The LinesChunk object computes and stores the hash of some consecutive stripped
110-
lines of a lineset.
111-
"""
112-
113-
__slots__ = ("_fileid", "_hash", "_index")
114-
115-
def __init__(self, fileid: str, num_line: int, *lines: Iterable[str]) -> None:
116-
self._fileid: str = fileid
117-
"""The name of the file from which the LinesChunk object is generated."""
118-
119-
self._index: Index = Index(num_line)
120-
"""The index in the stripped lines that is the starting of consecutive
121-
lines.
122-
"""
123-
124-
self._hash: int = sum(hash(lin) for lin in lines)
125-
"""The hash of some consecutive lines."""
126-
127-
def __eq__(self, o: object) -> bool:
128-
if not isinstance(o, LinesChunk):
129-
return NotImplemented
130-
return self._hash == o._hash
131-
132-
def __hash__(self) -> int:
133-
return self._hash
134-
135-
def __repr__(self) -> str:
136-
return (
137-
f"<LinesChunk object for file {self._fileid} ({self._index}, {self._hash})>"
138-
)
139-
140-
def __str__(self) -> str:
141-
return (
142-
f"LinesChunk object for file {self._fileid}, starting at line {self._index} \n"
143-
f"Hash is {self._hash}"
144-
)
145-
146-
147107
class SuccessiveLinesLimits:
148108
"""A class to handle the numbering of begin and end of successive lines.
149109
@@ -219,30 +179,41 @@ def hash_lineset(
219179
:return: a dict linking hashes to corresponding start index and a dict that links this
220180
index to the start and end lines in the file
221181
"""
222-
hash2index = defaultdict(list)
223-
index2lines = {}
224-
# Comments, docstring and other specific patterns maybe excluded -> call to stripped_lines
225-
# to get only what is desired
226-
lines = tuple(x.text for x in lineset.stripped_lines)
227-
# Need different iterators on same lines but each one is shifted 1 from the precedent
228-
shifted_lines = [iter(lines[i:]) for i in range(min_common_lines)]
229-
230-
for i, *succ_lines in enumerate(zip(*shifted_lines)):
231-
start_linenumber = LineNumber(lineset.stripped_lines[i].line_number)
232-
try:
233-
end_linenumber = lineset.stripped_lines[i + min_common_lines].line_number
234-
except IndexError:
235-
end_linenumber = LineNumber(lineset.stripped_lines[-1].line_number + 1)
182+
hash_to_index: HashToIndex_T = defaultdict(list)
183+
index_to_lines: IndexToLines_T = {}
184+
stripped = lineset.stripped_lines
185+
num_lines = len(stripped)
186+
if num_lines < min_common_lines:
187+
return hash_to_index, index_to_lines
188+
189+
# Pre-compute per-line hashes for the rolling window
190+
line_hashes = [hash(spec.text) for spec in stripped]
191+
192+
# Seed the rolling hash with the first window
193+
window_hash = sum(line_hashes[:min_common_lines])
194+
195+
last_index = num_lines - 1
196+
for i in range(num_lines - min_common_lines + 1):
197+
start_linenumber = LineNumber(stripped[i].line_number)
198+
window_end = i + min_common_lines
199+
end_linenumber = (
200+
stripped[window_end].line_number
201+
if window_end <= last_index
202+
else LineNumber(stripped[last_index].line_number + 1)
203+
)
236204

237205
index = Index(i)
238-
index2lines[index] = SuccessiveLinesLimits(
206+
index_to_lines[index] = SuccessiveLinesLimits(
239207
start=start_linenumber, end=end_linenumber
240208
)
241209

242-
l_c = LinesChunk(lineset.name, index, *succ_lines)
243-
hash2index[l_c].append(index)
210+
hash_to_index[window_hash].append(index)
211+
212+
# Slide the window: subtract the leaving line, add the entering line
213+
if window_end <= last_index:
214+
window_hash = window_hash - line_hashes[i] + line_hashes[window_end]
244215

245-
return hash2index, index2lines
216+
return hash_to_index, index_to_lines
246217

247218

248219
def remove_successive(all_couples: CplIndexToCplLines_T) -> None:
@@ -465,7 +436,11 @@ def _get_similarity_report(
465436

466437
# pylint: disable = too-many-locals
467438
def _find_common(
468-
self, lineset1: LineSet, lineset2: LineSet
439+
self,
440+
lineset1: LineSet,
441+
lineset2: LineSet,
442+
hashes1: tuple[HashToIndex_T, IndexToLines_T] | None = None,
443+
hashes2: tuple[HashToIndex_T, IndexToLines_T] | None = None,
469444
) -> Generator[Commonality]:
470445
"""Find similarities in the two given linesets.
471446
@@ -483,27 +458,28 @@ def _find_common(
483458
hash_to_index_2: HashToIndex_T
484459
index_to_lines_1: IndexToLines_T
485460
index_to_lines_2: IndexToLines_T
486-
hash_to_index_1, index_to_lines_1 = hash_lineset(
487-
lineset1, self.namespace.min_similarity_lines
488-
)
489-
hash_to_index_2, index_to_lines_2 = hash_lineset(
490-
lineset2, self.namespace.min_similarity_lines
491-
)
492-
493-
hash_1: frozenset[LinesChunk] = frozenset(hash_to_index_1.keys())
494-
hash_2: frozenset[LinesChunk] = frozenset(hash_to_index_2.keys())
461+
if hashes1 is not None:
462+
hash_to_index_1, index_to_lines_1 = hashes1
463+
else:
464+
hash_to_index_1, index_to_lines_1 = hash_lineset(
465+
lineset1, self.namespace.min_similarity_lines
466+
)
467+
if hashes2 is not None:
468+
hash_to_index_2, index_to_lines_2 = hashes2
469+
else:
470+
hash_to_index_2, index_to_lines_2 = hash_lineset(
471+
lineset2, self.namespace.min_similarity_lines
472+
)
495473

496-
common_hashes: Iterable[LinesChunk] = sorted(
497-
hash_1 & hash_2, key=lambda m: hash_to_index_1[m][0]
498-
)
474+
common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys()
499475

500476
# all_couples is a dict that links the couple of indices in both linesets that mark the beginning of
501477
# successive common lines, to the corresponding starting and ending number lines in both files
502478
all_couples: CplIndexToCplLines_T = {}
503479

504-
for c_hash in sorted(common_hashes, key=operator.attrgetter("_index")):
480+
for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0]):
505481
for indices_in_linesets in itertools.product(
506-
hash_to_index_1[c_hash], hash_to_index_2[c_hash]
482+
hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash]
507483
):
508484
index_1 = indices_in_linesets[0]
509485
index_2 = indices_in_linesets[1]
@@ -543,9 +519,21 @@ def _iter_sims(self) -> Generator[Commonality]:
543519
"""Iterate on similarities among all files, by making a Cartesian
544520
product.
545521
"""
522+
min_lines = self.namespace.min_similarity_lines
523+
# Cache hash_lineset results: each lineset is compared against every
524+
# other, so without caching it gets hashed (N-1) times.
525+
cache: dict[int, tuple[HashToIndex_T, IndexToLines_T]] = {}
546526
for idx, lineset in enumerate(self.linesets[:-1]):
547527
for lineset2 in self.linesets[idx + 1 :]:
548-
yield from self._find_common(lineset, lineset2)
528+
lid1 = id(lineset)
529+
if lid1 not in cache:
530+
cache[lid1] = hash_lineset(lineset, min_lines)
531+
lid2 = id(lineset2)
532+
if lid2 not in cache:
533+
cache[lid2] = hash_lineset(lineset2, min_lines)
534+
yield from self._find_common(
535+
lineset, lineset2, cache[lid1], cache[lid2]
536+
)
549537

550538
def get_map_data(self) -> list[LineSet]:
551539
"""Returns the data we can use for a map/reduce process.

0 commit comments

Comments
 (0)