From 3ef0aaa04259ad40ea36a0090ddf854e74472507 Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Sun, 1 Mar 2026 22:49:23 +0100
Subject: [PATCH 1/7] Speed up duplicate-code checker with rolling hash and
 caching

Three optimizations to the symilar checker:

- Rolling hash: compute the window hash incrementally (subtract the
  leaving line hash, add the entering one) instead of re-summing all k
  line hashes for every position.
- Cache hash_lineset results per lineset in _iter_sims: each file was
  being hashed once per pair (N-1 times) instead of once total.
- Remove the LinesChunk wrapper class and use plain int dict keys, so
  set intersection and dict lookups use C-level hash/eq.

~=25% faster on astroid (17.5s => 12.5s, 25k SLOC)
~=70% faster on django (273s => 77s, 130k SLOC)
---
 doc/whatsnew/fragments/10881.performance |   5 +
 pylint/checkers/symilar.py               | 142 +++++++++++------------
 2 files changed, 70 insertions(+), 77 deletions(-)
 create mode 100644 doc/whatsnew/fragments/10881.performance

diff --git a/doc/whatsnew/fragments/10881.performance b/doc/whatsnew/fragments/10881.performance
new file mode 100644
index 0000000000..a8bfa495e6
--- /dev/null
+++ b/doc/whatsnew/fragments/10881.performance
@@ -0,0 +1,5 @@
+Speed up the ``duplicate-code`` checker by using C-level hashing, a rolling hash
+window, and caching results across file pairs. Expect pylint to be ~25% faster on
+~25k SLOC (astroid) and ~70% faster on ~130k SLOC (django) when duplicate-code is enabled.
+
+Refs #10881
diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py
index 8d504f3364..23ef700859 100644
--- a/pylint/checkers/symilar.py
+++ b/pylint/checkers/symilar.py
@@ -33,7 +33,6 @@
 import copy
 import functools
 import itertools
-import operator
 import re
 import sys
 import warnings
@@ -71,9 +70,9 @@ class LineSpecifs(NamedTuple):
     text: str
 
 
-# Links LinesChunk object to the starting indices (in lineset's stripped lines)
-# of the different chunk of lines that are used to compute the hash
-HashToIndex_T = dict["LinesChunk", list[Index]]
+# Maps the hash of successive stripped lines to the starting indices
+# (in lineset's stripped lines) of the chunks that produced that hash.
+HashToIndex_T = dict[int, list[Index]]
 
 # Links index in the lineset's stripped lines to the real lines in the file
 IndexToLines_T = dict[Index, "SuccessiveLinesLimits"]
@@ -105,45 +104,6 @@ def __init__(
 CplIndexToCplLines_T = dict["LineSetStartCouple", CplSuccessiveLinesLimits]
 
 
-class LinesChunk:
-    """The LinesChunk object computes and stores the hash of some consecutive stripped
-    lines of a lineset.
-    """
-
-    __slots__ = ("_fileid", "_hash", "_index")
-
-    def __init__(self, fileid: str, num_line: int, *lines: Iterable[str]) -> None:
-        self._fileid: str = fileid
-        """The name of the file from which the LinesChunk object is generated."""
-
-        self._index: Index = Index(num_line)
-        """The index in the stripped lines that is the starting of consecutive
-        lines.
-        """
-
-        self._hash: int = sum(hash(lin) for lin in lines)
-        """The hash of some consecutive lines."""
-
-    def __eq__(self, o: object) -> bool:
-        if not isinstance(o, LinesChunk):
-            return NotImplemented
-        return self._hash == o._hash
-
-    def __hash__(self) -> int:
-        return self._hash
-
-    def __repr__(self) -> str:
-        return (
-            f"<LinesChunk object for file {self._fileid} ({self._index}, {self._hash})>"
-        )
-
-    def __str__(self) -> str:
-        return (
-            f"LinesChunk object for file {self._fileid}, starting at line {self._index} \n"
-            f"Hash is {self._hash}"
-        )
-
-
 class SuccessiveLinesLimits:
     """A class to handle the numbering of begin and end of successive lines.
 
@@ -219,30 +179,41 @@ def hash_lineset(
     :return: a dict linking hashes to corresponding start index and a dict that
         links this index to the start and end lines in the file
     """
-    hash2index = defaultdict(list)
-    index2lines = {}
-    # Comments, docstring and other specific patterns maybe excluded -> call to stripped_lines
-    # to get only what is desired
-    lines = tuple(x.text for x in lineset.stripped_lines)
-    # Need different iterators on same lines but each one is shifted 1 from the precedent
-    shifted_lines = [iter(lines[i:]) for i in range(min_common_lines)]
-
-    for i, *succ_lines in enumerate(zip(*shifted_lines)):
-        start_linenumber = LineNumber(lineset.stripped_lines[i].line_number)
-        try:
-            end_linenumber = lineset.stripped_lines[i + min_common_lines].line_number
-        except IndexError:
-            end_linenumber = LineNumber(lineset.stripped_lines[-1].line_number + 1)
+    hash_to_index: HashToIndex_T = defaultdict(list)
+    index_to_lines: IndexToLines_T = {}
+    stripped = lineset.stripped_lines
+    num_lines = len(stripped)
+    if num_lines < min_common_lines:
+        return hash_to_index, index_to_lines
+
+    # Pre-compute per-line hashes for the rolling window
+    line_hashes = [hash(spec.text) for spec in stripped]
+
+    # Seed the rolling hash with the first window
+    window_hash = sum(line_hashes[:min_common_lines])
+
+    last_index = num_lines - 1
+    for i in range(num_lines - min_common_lines + 1):
+        start_linenumber = LineNumber(stripped[i].line_number)
+        window_end = i + min_common_lines
+        end_linenumber = (
+            stripped[window_end].line_number
+            if window_end <= last_index
+            else LineNumber(stripped[last_index].line_number + 1)
+        )
         index = Index(i)
-        index2lines[index] = SuccessiveLinesLimits(
+        index_to_lines[index] = SuccessiveLinesLimits(
             start=start_linenumber, end=end_linenumber
         )
-        l_c = LinesChunk(lineset.name, index, *succ_lines)
-        hash2index[l_c].append(index)
+        hash_to_index[window_hash].append(index)
+
+        # Slide the window: subtract the leaving line, add the entering line
+        if window_end <= last_index:
+            window_hash = window_hash - line_hashes[i] + line_hashes[window_end]
 
-    return hash2index, index2lines
+    return hash_to_index, index_to_lines
 
 
 def remove_successive(all_couples: CplIndexToCplLines_T) -> None:
@@ -465,7 +436,11 @@ def _get_similarity_report(
 
     # pylint: disable = too-many-locals
     def _find_common(
-        self, lineset1: LineSet, lineset2: LineSet
+        self,
+        lineset1: LineSet,
+        lineset2: LineSet,
+        hashes1: tuple[HashToIndex_T, IndexToLines_T] | None = None,
+        hashes2: tuple[HashToIndex_T, IndexToLines_T] | None = None,
     ) -> Generator[Commonality]:
         """Find similarities in the two given linesets.
@@ -483,27 +458,28 @@ def _find_common( hash_to_index_2: HashToIndex_T index_to_lines_1: IndexToLines_T index_to_lines_2: IndexToLines_T - hash_to_index_1, index_to_lines_1 = hash_lineset( - lineset1, self.namespace.min_similarity_lines - ) - hash_to_index_2, index_to_lines_2 = hash_lineset( - lineset2, self.namespace.min_similarity_lines - ) - - hash_1: frozenset[LinesChunk] = frozenset(hash_to_index_1.keys()) - hash_2: frozenset[LinesChunk] = frozenset(hash_to_index_2.keys()) + if hashes1 is not None: + hash_to_index_1, index_to_lines_1 = hashes1 + else: + hash_to_index_1, index_to_lines_1 = hash_lineset( + lineset1, self.namespace.min_similarity_lines + ) + if hashes2 is not None: + hash_to_index_2, index_to_lines_2 = hashes2 + else: + hash_to_index_2, index_to_lines_2 = hash_lineset( + lineset2, self.namespace.min_similarity_lines + ) - common_hashes: Iterable[LinesChunk] = sorted( - hash_1 & hash_2, key=lambda m: hash_to_index_1[m][0] - ) + common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys() # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of # successive common lines, to the corresponding starting and ending number lines in both files all_couples: CplIndexToCplLines_T = {} - for c_hash in sorted(common_hashes, key=operator.attrgetter("_index")): + for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0]): for indices_in_linesets in itertools.product( - hash_to_index_1[c_hash], hash_to_index_2[c_hash] + hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash] ): index_1 = indices_in_linesets[0] index_2 = indices_in_linesets[1] @@ -543,9 +519,21 @@ def _iter_sims(self) -> Generator[Commonality]: """Iterate on similarities among all files, by making a Cartesian product. """ + min_lines = self.namespace.min_similarity_lines + # Cache hash_lineset results: each lineset is compared against every + # other, so without caching it gets hashed (N-1) times. + cache: dict[int, tuple[HashToIndex_T, IndexToLines_T]] = {} for idx, lineset in enumerate(self.linesets[:-1]): for lineset2 in self.linesets[idx + 1 :]: - yield from self._find_common(lineset, lineset2) + lid1 = id(lineset) + if lid1 not in cache: + cache[lid1] = hash_lineset(lineset, min_lines) + lid2 = id(lineset2) + if lid2 not in cache: + cache[lid2] = hash_lineset(lineset2, min_lines) + yield from self._find_common( + lineset, lineset2, cache[lid1], cache[lid2] + ) def get_map_data(self) -> list[LineSet]: """Returns the data we can use for a map/reduce process. From 9b40232d2becd3f8bbad5a3c42f9fe3143f2a02e Mon Sep 17 00:00:00 2001 From: Pierre Sassoulas Date: Mon, 2 Mar 2026 15:35:27 +0100 Subject: [PATCH 2/7] Introduce LineSetHashResult and make hash params required in _find_common Replace the raw tuple[HashToIndex_T, IndexToLines_T] with a LineSetHashResult NamedTuple for clarity. Since _iter_sims always passes pre-computed hashes from its cache, make the parameters required and remove the unused fallback branches. 
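
For illustration, the call shape after this change looks like the
following (a minimal sketch using this patch's names, not a verbatim
excerpt of the diff below):

    hashes1 = hash_lineset(lineset1, min_lines)  # a LineSetHashResult
    hashes2 = hash_lineset(lineset2, min_lines)
    common = hashes1.hash_to_index.keys() & hashes2.hash_to_index.keys()

Accessing the fields by name replaces positional tuple unpacking at
each call site.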
--- pylint/checkers/symilar.py | 80 +++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py index 23ef700859..156da6fa21 100644 --- a/pylint/checkers/symilar.py +++ b/pylint/checkers/symilar.py @@ -77,6 +77,14 @@ class LineSpecifs(NamedTuple): # Links index in the lineset's stripped lines to the real lines in the file IndexToLines_T = dict[Index, "SuccessiveLinesLimits"] + +class LineSetHashResult(NamedTuple): + """Pre-computed hash data for a LineSet, used to speed up similarity lookup.""" + + hash_to_index: HashToIndex_T + index_to_lines: IndexToLines_T + + # The types the streams read by pylint can take. Originating from astroid.nodes.Module.stream() and open() STREAM_TYPES: TypeAlias = TextIO | BufferedReader | BytesIO @@ -166,25 +174,25 @@ def increment(self, value: Index) -> LineSetStartCouple: def hash_lineset( lineset: LineSet, min_common_lines: int = DEFAULT_MIN_SIMILARITY_LINE -) -> tuple[HashToIndex_T, IndexToLines_T]: - """Return two dicts. +) -> LineSetHashResult: + """Return pre-computed hash data for a lineset. - The first associates the hash of successive stripped lines of a lineset - to the indices of the starting lines. - The second dict, associates the index of the starting line in the lineset's stripped lines to the - couple [start, end] lines number in the corresponding file. + The result contains two dicts: + - hash_to_index: maps the hash of successive stripped lines to the starting + indices (in lineset's stripped lines) of the chunks that produced that hash. + - index_to_lines: maps the index of the starting line in the lineset's stripped + lines to the start and end line numbers in the corresponding file. :param lineset: lineset object (i.e the lines in a file) :param min_common_lines: number of successive lines that are used to compute the hash - :return: a dict linking hashes to corresponding start index and a dict that links this - index to the start and end lines in the file + :return: a LineSetHashResult with hash-to-index and index-to-lines mappings """ hash_to_index: HashToIndex_T = defaultdict(list) index_to_lines: IndexToLines_T = {} stripped = lineset.stripped_lines num_lines = len(stripped) if num_lines < min_common_lines: - return hash_to_index, index_to_lines + return LineSetHashResult(hash_to_index, index_to_lines) # Pre-compute per-line hashes for the rolling window line_hashes = [hash(spec.text) for spec in stripped] @@ -213,7 +221,7 @@ def hash_lineset( if window_end <= last_index: window_hash = window_hash - line_hashes[i] + line_hashes[window_end] - return hash_to_index, index_to_lines + return LineSetHashResult(hash_to_index, index_to_lines) def remove_successive(all_couples: CplIndexToCplLines_T) -> None: @@ -434,13 +442,12 @@ def _get_similarity_report( ) return report - # pylint: disable = too-many-locals def _find_common( self, lineset1: LineSet, lineset2: LineSet, - hashes1: tuple[HashToIndex_T, IndexToLines_T] | None = None, - hashes2: tuple[HashToIndex_T, IndexToLines_T] | None = None, + hashes1: LineSetHashResult, + hashes2: LineSetHashResult, ) -> Generator[Commonality]: """Find similarities in the two given linesets. @@ -454,39 +461,24 @@ def _find_common( account common chunk of lines that have more than the minimal number of successive lines required. 
""" - hash_to_index_1: HashToIndex_T - hash_to_index_2: HashToIndex_T - index_to_lines_1: IndexToLines_T - index_to_lines_2: IndexToLines_T - if hashes1 is not None: - hash_to_index_1, index_to_lines_1 = hashes1 - else: - hash_to_index_1, index_to_lines_1 = hash_lineset( - lineset1, self.namespace.min_similarity_lines - ) - if hashes2 is not None: - hash_to_index_2, index_to_lines_2 = hashes2 - else: - hash_to_index_2, index_to_lines_2 = hash_lineset( - lineset2, self.namespace.min_similarity_lines - ) - - common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys() + common_hashes = hashes1.hash_to_index.keys() & hashes2.hash_to_index.keys() # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of # successive common lines, to the corresponding starting and ending number lines in both files all_couples: CplIndexToCplLines_T = {} - for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0]): + for chunk_hash in sorted( + common_hashes, key=lambda h: hashes1.hash_to_index[h][0] + ): for indices_in_linesets in itertools.product( - hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash] + hashes1.hash_to_index[chunk_hash], hashes2.hash_to_index[chunk_hash] ): index_1 = indices_in_linesets[0] index_2 = indices_in_linesets[1] all_couples[LineSetStartCouple(index_1, index_2)] = ( CplSuccessiveLinesLimits( - copy.copy(index_to_lines_1[index_1]), - copy.copy(index_to_lines_2[index_2]), + copy.copy(hashes1.index_to_lines[index_1]), + copy.copy(hashes2.index_to_lines[index_2]), effective_cmn_lines_nb=self.namespace.min_similarity_lines, ) ) @@ -522,17 +514,17 @@ def _iter_sims(self) -> Generator[Commonality]: min_lines = self.namespace.min_similarity_lines # Cache hash_lineset results: each lineset is compared against every # other, so without caching it gets hashed (N-1) times. - cache: dict[int, tuple[HashToIndex_T, IndexToLines_T]] = {} - for idx, lineset in enumerate(self.linesets[:-1]): + cache: dict[int, LineSetHashResult] = {} + for idx, lineset1 in enumerate(self.linesets[:-1]): for lineset2 in self.linesets[idx + 1 :]: - lid1 = id(lineset) - if lid1 not in cache: - cache[lid1] = hash_lineset(lineset, min_lines) - lid2 = id(lineset2) - if lid2 not in cache: - cache[lid2] = hash_lineset(lineset2, min_lines) + key1 = id(lineset1) + if key1 not in cache: + cache[key1] = hash_lineset(lineset1, min_lines) + key2 = id(lineset2) + if key2 not in cache: + cache[key2] = hash_lineset(lineset2, min_lines) yield from self._find_common( - lineset, lineset2, cache[lid1], cache[lid2] + lineset1, lineset2, cache[key1], cache[key2] ) def get_map_data(self) -> list[LineSet]: From 8027e6cc194f7ac9d6df892a0f89eabf227fd257 Mon Sep 17 00:00:00 2001 From: Pierre Sassoulas Date: Tue, 3 Mar 2026 09:44:44 +0100 Subject: [PATCH 3/7] Reuse parsed AST in stripped_lines to avoid redundant astroid.parse() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit process_module already receives the parsed nodes.Module from pylint's main pass, but stripped_lines was re-parsing every file from source text. Thread the existing AST through process_module → append_stream → LineSet → stripped_lines, falling back to astroid.parse() only when no tree is provided (standalone symilar CLI). The redundant parse dominated stripped_lines cost. 
Per-file savings: file size time saved memory saved 0 lines 0.14 ms 0.03 MB 924 lines 65 ms 2.1 MB 31k lines 2764 ms 101.6 MB End-to-end on pylint's own codebase (179 files, ~49k SLOC): before median=6.6s peak RSS=170 MB after median=5.1s peak RSS=149 MB (1.5x faster, -12% memory) --- pylint/checkers/symilar.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py index 156da6fa21..e87fa07497 100644 --- a/pylint/checkers/symilar.py +++ b/pylint/checkers/symilar.py @@ -336,7 +336,11 @@ def __init__( self.linesets: list[LineSet] = [] def append_stream( - self, streamid: str, stream: STREAM_TYPES, encoding: str | None = None + self, + streamid: str, + stream: STREAM_TYPES, + encoding: str | None = None, + tree: nodes.Module | None = None, ) -> None: """Append a file to search for similarities.""" if isinstance(stream, BufferedIOBase): @@ -365,6 +369,7 @@ def append_stream( if hasattr(self, "linter") else None ), + tree=tree, ) ) @@ -550,6 +555,7 @@ def stripped_lines( ignore_imports: bool, ignore_signatures: bool, line_enabled_callback: Callable[[str, int], bool] | None = None, + tree: nodes.Module | None = None, ) -> list[LineSpecifs]: """Return tuples of line/line number/line type with leading/trailing white-space and any ignored code features removed. @@ -561,11 +567,13 @@ def stripped_lines( :param ignore_signatures: if true, any line that is part of a function signature is removed from the result :param line_enabled_callback: If called with "R0801" and a line number, a return value of False will disregard the line + :param tree: pre-parsed AST; when provided the redundant astroid.parse() call is skipped :return: the collection of line/line number/line type tuples """ ignore_lines: set[int] = set() if ignore_imports or ignore_signatures: - tree = astroid.parse("".join(lines)) + if tree is None: + tree = astroid.parse("".join(lines)) if ignore_imports: ignore_lines.update( chain.from_iterable( @@ -654,6 +662,7 @@ def __init__( ignore_imports: bool = False, ignore_signatures: bool = False, line_enabled_callback: Callable[[str, int], bool] | None = None, + tree: nodes.Module | None = None, ) -> None: self.name = name self._real_lines = lines @@ -664,6 +673,7 @@ def __init__( ignore_imports, ignore_signatures, line_enabled_callback=line_enabled_callback, + tree=tree, ) def __str__(self) -> str: @@ -816,7 +826,9 @@ def process_module(self, node: nodes.Module) -> None: stacklevel=2, ) with node.stream() as stream: - self.append_stream(self.linter.current_name, stream, node.file_encoding) + self.append_stream( + self.linter.current_name, stream, node.file_encoding, tree=node + ) def close(self) -> None: """Compute and display similarities on closing (i.e. end of parsing).""" From dae5b833ceb0fb24a04f56f84a77521d9af8da86 Mon Sep 17 00:00:00 2001 From: Pierre Sassoulas Date: Tue, 3 Mar 2026 10:15:24 +0100 Subject: [PATCH 4/7] Cap hash bucket Cartesian product to prevent quadratic blow-up When two files share a hash bucket with many indices (e.g. thousands of identical data lines), itertools.product explodes quadratically (4k x 22k = 88M pairs). Fall back to aligned zip pairing when the product exceeds 500 entries. The diagonal pairs are consecutive so remove_successive still coalesces them into one correct block. Benchmark on psf/black (316 files, 129k lines including pathological profiling/ data files with 22k+ identical lines): hung forever -> 7.2s. 
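
A toy illustration of the size argument (standalone sketch, not code
from this patch):

    indices_1 = list(range(4_000))   # bucket indices from file 1
    indices_2 = list(range(22_000))  # bucket indices from file 2
    # The full Cartesian product visits 4_000 * 22_000 pairs.
    assert len(indices_1) * len(indices_2) == 88_000_000
    # The aligned fallback visits only min(N, M) pairs.
    assert len(list(zip(indices_1, indices_2))) == 4_000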
---
 doc/whatsnew/4/4.1/index.rst             |  5 +++
 doc/whatsnew/fragments/10881.performance | 15 +++++--
 pylint/checkers/symilar.py               | 51 +++++++++++++++---------
 3 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/doc/whatsnew/4/4.1/index.rst b/doc/whatsnew/4/4.1/index.rst
index 9fb075b671..b6c65db818 100644
--- a/doc/whatsnew/4/4.1/index.rst
+++ b/doc/whatsnew/4/4.1/index.rst
@@ -12,6 +12,11 @@ Summary -- Release highlights
 =============================
 
+The duplicate-code checker and ``symilar`` were optimized, yielding
+considerable speedups and lower memory use on larger codebases. For
+example, analyzing pandas went from 20 min to 55 s, and pylint no
+longer gets OOM-killed when analyzing CPython.
+
 The required ``astroid`` version is now 4.1.1. See the
 `astroid changelog
 <https://github.com/pylint-dev/astroid/blob/main/ChangeLog>`_
 for additional fixes, features, and performance improvements applicable
 to pylint.
diff --git a/doc/whatsnew/fragments/10881.performance b/doc/whatsnew/fragments/10881.performance
index a8bfa495e6..f9ced17eaf 100644
--- a/doc/whatsnew/fragments/10881.performance
+++ b/doc/whatsnew/fragments/10881.performance
@@ -1,5 +1,14 @@
-Speed up the ``duplicate-code`` checker by using C-level hashing, a rolling hash
-window, and caching results across file pairs. Expect pylint to be ~25% faster on
-~25k SLOC (astroid) and ~70% faster on ~130k SLOC (django) when duplicate-code is enabled.
+Sped up the ``duplicate-code`` checker. When run inside pylint the
+checker now reuses the already-parsed AST instead of re-parsing every
+file, as it still must when launched via the standalone ``symilar``
+command, and it uses a rolling hash window with caching across file
+pairs. The hash-matching phase also switches algorithms above a
+threshold, avoiding a quadratic blow-up that previously caused the
+checker to hang on files with many repeated lines.
+
+The speedup scales with codebase size, from 1.5x on small projects
+(~10k lines) to 20x on large ones (500k+ lines). Memory usage also
+drops by 12-27%. Codebases that previously hung or were OOM-killed
+can now complete.
 
 Refs #10881
diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py
index e87fa07497..e986ab1fd5 100644
--- a/pylint/checkers/symilar.py
+++ b/pylint/checkers/symilar.py
@@ -57,6 +57,12 @@
 
 REGEX_FOR_LINES_WITH_CONTENT = re.compile(r".*\w+")
 
+# When two files share a hash bucket whose Cartesian product exceeds this
+# limit, fall back to aligned (zip) pairing instead of the full product.
+# This prevents quadratic blow-up on files with many identical lines (e.g.
+# auto-generated data). The result is a correct lower bound on duplicates.
+_HASH_BUCKET_PRODUCT_LIMIT: int = 500
+
 # Index defines a location in a LineSet stripped lines collection
 Index = NewType("Index", int)
 
@@ -475,11 +481,19 @@ def _find_common(
         for chunk_hash in sorted(
             common_hashes, key=lambda h: hashes1.hash_to_index[h][0]
         ):
-            for indices_in_linesets in itertools.product(
-                hashes1.hash_to_index[chunk_hash], hashes2.hash_to_index[chunk_hash]
-            ):
-                index_1 = indices_in_linesets[0]
-                index_2 = indices_in_linesets[1]
+            indices_1 = hashes1.hash_to_index[chunk_hash]
+            indices_2 = hashes2.hash_to_index[chunk_hash]
+
+            # When both buckets are large the Cartesian product becomes
+            # quadratic (e.g. 4000 x 22000 = 88M pairs for repeated data
+            # lines). Fall back to aligned pairing which is O(min(N, M))
+            # and still lets remove_successive coalesce consecutive matches.
+            if len(indices_1) * len(indices_2) > _HASH_BUCKET_PRODUCT_LIMIT:
+                pairs: Iterable[tuple[Index, Index]] = zip(indices_1, indices_2)
+            else:
+                pairs = itertools.product(indices_1, indices_2)
+
+            for index_1, index_2 in pairs:
                 all_couples[LineSetStartCouple(index_1, index_2)] = (
                     CplSuccessiveLinesLimits(
                         copy.copy(hashes1.index_to_lines[index_1]),
@@ -493,24 +507,25 @@ def _find_common(
         for cml_stripped_l, cmn_l in all_couples.items():
             start_index_1 = cml_stripped_l.fst_lineset_index
             start_index_2 = cml_stripped_l.snd_lineset_index
-            nb_common_lines = cmn_l.effective_cmn_lines_nb
-
-            com = Commonality(
-                cmn_lines_nb=nb_common_lines,
-                fst_lset=lineset1,
-                fst_file_start=cmn_l.first_file.start,
-                fst_file_end=cmn_l.first_file.end,
-                snd_lset=lineset2,
-                snd_file_start=cmn_l.second_file.start,
-                snd_file_end=cmn_l.second_file.end,
-            )
 
             eff_cmn_nb = filter_noncode_lines(
-                lineset1, start_index_1, lineset2, start_index_2, nb_common_lines
+                lineset1,
+                start_index_1,
+                lineset2,
+                start_index_2,
+                cmn_l.effective_cmn_lines_nb,
             )
 
             if eff_cmn_nb > self.namespace.min_similarity_lines:
-                yield com
+                yield Commonality(
+                    cmn_lines_nb=cmn_l.effective_cmn_lines_nb,
+                    fst_lset=lineset1,
+                    fst_file_start=cmn_l.first_file.start,
+                    fst_file_end=cmn_l.first_file.end,
+                    snd_lset=lineset2,
+                    snd_file_start=cmn_l.second_file.start,
+                    snd_file_end=cmn_l.second_file.end,
+                )
 
     def _iter_sims(self) -> Generator[Commonality]:
         """Iterate on similarities among all files, by making a Cartesian

From bc95dfc6aa987391a0521f9fb95454245d4fe78c Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Tue, 3 Mar 2026 16:29:06 +0100
Subject: [PATCH 5/7] Re-enable duplicate-code checker in stdlib primer test

The duplicate-code checker was previously disabled in the stdlib primer
because it was too slow. With the recent performance optimizations it
should now complete in a reasonable time, so re-enable it.
---
 tests/primer/test_primer_stdlib.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/primer/test_primer_stdlib.py b/tests/primer/test_primer_stdlib.py
index cd64c212b1..456a7c56a9 100644
--- a/tests/primer/test_primer_stdlib.py
+++ b/tests/primer/test_primer_stdlib.py
@@ -59,9 +59,8 @@ def test_primer_stdlib_no_crash(
     try:
         # We want to test all the code we can
         enables = ["--enable-all-extensions", "--enable=all"]
-        # Duplicate code takes too long and is relatively safe
         # We don't want to lint the test directory which are repetitive
-        disables = ["--disable=duplicate-code", "--ignore=test"]
+        disables = ["--ignore=test"]
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=UserWarning)
             Run([test_module_name, *enables, *disables])

From a8bef0e8244808a032e6a8c74d783d6308d5690e Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Fri, 24 Apr 2026 16:19:04 +0200
Subject: [PATCH 6/7] Cover aligned-zip fallback in symilar's _find_common

Adds a functional test that patches `_HASH_BUCKET_PRODUCT_LIMIT` to
zero and runs symilar over repeated-block content so the aligned-zip
fallback path is always exercised. Covers the previously-untested
branch flagged in the codecov report on #10881.

Addresses Jacob's review request for test coverage on the "other form
of the algorithm" introduced to cap quadratic behavior.
---
 tests/checkers/unittest_symilar.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/checkers/unittest_symilar.py b/tests/checkers/unittest_symilar.py
index d51b070d3d..bb47694174 100644
--- a/tests/checkers/unittest_symilar.py
+++ b/tests/checkers/unittest_symilar.py
@@ -470,3 +470,33 @@ def test_bad_short_form_option(capsys: CaptureFixture) -> None:
     assert ex.value.code == 2
     assert not out
     assert "unrecognized arguments: -j=0" in err
+
+
+def test_hash_bucket_product_limit_fallback(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """When a hash bucket's Cartesian product exceeds
+    ``_HASH_BUCKET_PRODUCT_LIMIT``, ``_find_common`` falls back to aligned-zip
+    pairing. Mock the limit to zero so the fallback path is always taken over
+    a file with repeated blocks and verify duplicate detection still reports
+    the expected similar lines.
+
+    Regression test for https://github.com/pylint-dev/pylint/pull/10881.
+    """
+    monkeypatch.setattr(symilar, "_HASH_BUCKET_PRODUCT_LIMIT", 0)
+    # Three copies of the same 5-line block produce hash buckets with more
+    # than one index, exercising the aligned-zip fallback meaningfully.
+    block = "a = 1\nb = 2\nc = 3\nd = 4\ne = 5\n"
+    file_a = tmp_path / "a.py"
+    file_b = tmp_path / "b.py"
+    file_a.write_text(block * 3)
+    file_b.write_text(block * 3)
+
+    output = StringIO()
+    with redirect_stdout(output), pytest.raises(SystemExit) as ex:
+        symilar.Run([str(file_a), str(file_b)])
+    assert ex.value.code == 0
+    out = output.getvalue()
+    assert "15 similar lines in 2 files" in out
+    assert f"=={file_a}:[0:15]" in out
+    assert f"=={file_b}:[0:15]" in out

From 1e003edf2fb70882682e6e028e04ac3ce5cd750a Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Fri, 24 Apr 2026 16:28:53 +0200
Subject: [PATCH 7/7] Cover previously-untested dunder methods and edge cases
 in symilar

Adds focused tests for `LineSetStartCouple.__eq__` NotImplemented
branch, `LineSet.__str__` / `__getitem__` / non-LineSet `__eq__`,
`append_stream` binary-without-encoding and UnicodeDecodeError paths,
`report_similarities` table building, and the `process_module`
deprecation warning when `linter.current_name` is None. Takes
symilar.py from 96% to 99% coverage. The remaining gaps are an
unreachable defensive `except KeyError` in `remove_successive` and the
`if __name__ == "__main__"` guard.
---
 tests/checkers/unittest_symilar.py | 76 +++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/tests/checkers/unittest_symilar.py b/tests/checkers/unittest_symilar.py
index bb47694174..ec13a6e91b 100644
--- a/tests/checkers/unittest_symilar.py
+++ b/tests/checkers/unittest_symilar.py
@@ -3,15 +3,19 @@
 # Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt
 
 from contextlib import redirect_stdout
-from io import StringIO
+from io import BytesIO, StringIO
 from pathlib import Path
 
+import astroid
 import pytest
 from _pytest.capture import CaptureFixture
 
 from pylint.checkers import symilar
+from pylint.checkers.symilar import LineSet, LineSetStartCouple, Symilar
 from pylint.lint import PyLinter
+from pylint.reporters.ureports.nodes import Section, Table
 from pylint.testutils import GenericTestReporter as Reporter
+from pylint.utils import LinterStats
 
 INPUT = Path(__file__).parent / ".." / "input"
/ "input" SIMILAR1 = str(INPUT / "similar1") @@ -472,6 +476,76 @@ def test_bad_short_form_option(capsys: CaptureFixture) -> None: assert "unrecognized arguments: -j=0" in err +def test_line_set_start_couple_eq_non_couple() -> None: + """``LineSetStartCouple.__eq__`` returns ``NotImplemented`` against an + object that isn't a ``LineSetStartCouple`` so Python falls back to + reflected comparison. + """ + couple = LineSetStartCouple(symilar.Index(0), symilar.Index(1)) + assert couple != object() + # pylint: disable-next=unnecessary-dunder-call + assert couple.__eq__(object()) is NotImplemented + + +def test_line_set_dunder_methods() -> None: + """Cover LineSet ``__str__``, ``__getitem__`` and non-LineSet ``__eq__``.""" + lines = ["a = 1\n", "b = 2\n", "c = 3\n"] + lineset = LineSet("fake.py", lines) + assert str(lineset) == "" + assert lineset[0].text == "a = 1" + assert (lineset == "not a lineset") is False + + +def test_append_stream_binary_requires_encoding() -> None: + """``append_stream`` raises ValueError when a binary stream is passed + without an encoding. + """ + runner = Symilar() + with pytest.raises(ValueError): + runner.append_stream("bin.py", BytesIO(b"a = 1\n")) + + +def test_report_similarities_builds_table() -> None: + """``report_similarities`` appends a stats table to the given section.""" + stats = LinterStats() + stats.reset_duplicated_lines() + section = Section() + symilar.report_similarities(section, stats, None) + assert len(section.children) == 1 + assert isinstance(section.children[0], Table) + + +def test_process_module_warns_when_current_name_is_none(tmp_path: Path) -> None: + """``SimilaritiesChecker.process_module`` warns when + ``linter.current_name`` is None (the deprecated state). + """ + linter = PyLinter(reporter=Reporter()) + linter.register_checker(symilar.SimilaritiesChecker(linter)) + checker = symilar.SimilaritiesChecker(linter) + linter.current_name = None # type: ignore[assignment] + module_file = tmp_path / "m.py" + module_file.write_text("a = 1\n") + + module = astroid.parse(module_file.read_text(), module_name="m") + module.file = str(module_file) + module.file_bytes = module_file.read_bytes() + with pytest.warns(DeprecationWarning, match="current_name attribute"): + checker.process_module(module) + + +def test_append_stream_unicode_error_yields_empty_lineset(tmp_path: Path) -> None: + """``append_stream`` swallows ``UnicodeDecodeError`` and treats the file + as empty rather than crashing. + """ + bad_file = tmp_path / "bad.py" + bad_file.write_bytes(b"\xff\xfe\xfa not valid utf-8\n") + runner = Symilar() + with bad_file.open("rb") as stream: + runner.append_stream(str(bad_file), stream, encoding="utf-8") + assert len(runner.linesets) == 1 + assert runner.linesets[0].stripped_lines == [] + + def test_hash_bucket_product_limit_fallback( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: