From 3ef0aaa04259ad40ea36a0090ddf854e74472507 Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Sun, 1 Mar 2026 22:49:23 +0100
Subject: [PATCH 1/7] Speed up duplicate-code checker with rolling hash and
 caching

Three optimizations to the symilar checker:

- Rolling hash: compute the window hash incrementally (subtract the
  leaving line hash, add the entering one) instead of re-summing all k
  line hashes for every position.
- Cache hash_lineset results per lineset in _iter_sims: each file was
  being hashed once per pair (N-1 times) instead of once total.
- Remove the LinesChunk wrapper class and use plain int dict keys, so
  set intersection and dict lookups use C-level hash/eq.

~=25% faster on astroid (17.5s => 12.5s, 25k SLOC)
~=70% faster on django (273s => 77s, 130k SLOC)
---
 doc/whatsnew/fragments/10881.performance |   5 +
 pylint/checkers/symilar.py               | 142 +++++++++++------------
 2 files changed, 70 insertions(+), 77 deletions(-)
 create mode 100644 doc/whatsnew/fragments/10881.performance

diff --git a/doc/whatsnew/fragments/10881.performance b/doc/whatsnew/fragments/10881.performance
new file mode 100644
index 0000000000..a8bfa495e6
--- /dev/null
+++ b/doc/whatsnew/fragments/10881.performance
@@ -0,0 +1,5 @@
+Speed up the ``duplicate-code`` checker by using C-level hashing, a rolling hash
+window, and caching results across file pairs. Expect pylint to be ~25% faster on
+~25k SLOC (astroid) and ~70% faster on ~130k SLOC (django) when duplicate-code is enabled.
+
+Refs #10881
diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py
index 8d504f3364..23ef700859 100644
--- a/pylint/checkers/symilar.py
+++ b/pylint/checkers/symilar.py
@@ -33,7 +33,6 @@
 import copy
 import functools
 import itertools
-import operator
 import re
 import sys
 import warnings
@@ -71,9 +70,9 @@ class LineSpecifs(NamedTuple):
     text: str
 
 
-# Links LinesChunk object to the starting indices (in lineset's stripped lines)
-# of the different chunk of lines that are used to compute the hash
-HashToIndex_T = dict["LinesChunk", list[Index]]
+# Maps the hash of successive stripped lines to the starting indices
+# (in lineset's stripped lines) of the chunks that produced that hash.
+HashToIndex_T = dict[int, list[Index]]
 
 # Links index in the lineset's stripped lines to the real lines in the file
 IndexToLines_T = dict[Index, "SuccessiveLinesLimits"]
@@ -105,45 +104,6 @@ def __init__(
 CplIndexToCplLines_T = dict["LineSetStartCouple", CplSuccessiveLinesLimits]
 
 
-class LinesChunk:
-    """The LinesChunk object computes and stores the hash of some consecutive stripped
-    lines of a lineset.
-    """
-
-    __slots__ = ("_fileid", "_hash", "_index")
-
-    def __init__(self, fileid: str, num_line: int, *lines: Iterable[str]) -> None:
-        self._fileid: str = fileid
-        """The name of the file from which the LinesChunk object is generated."""
-
-        self._index: Index = Index(num_line)
-        """The index in the stripped lines that is the starting of consecutive
-        lines.
-        """
-
-        self._hash: int = sum(hash(lin) for lin in lines)
-        """The hash of some consecutive lines."""
-
-    def __eq__(self, o: object) -> bool:
-        if not isinstance(o, LinesChunk):
-            return NotImplemented
-        return self._hash == o._hash
-
-    def __hash__(self) -> int:
-        return self._hash
-
-    def __repr__(self) -> str:
-        return (
-            f"<LinesChunk object for file {self._fileid} ({self._index}, {self._hash})>"
-        )
-
-    def __str__(self) -> str:
-        return (
-            f"LinesChunk object for file {self._fileid}, starting at line {self._index} \n"
-            f"Hash is {self._hash}"
-        )
-
-
 class SuccessiveLinesLimits:
     """A class to handle the numbering of begin and end of successive lines.
 
@@ -219,30 +179,41 @@ def hash_lineset(
     :return: a dict linking hashes to corresponding start index and a dict that
         links this index to the start and end lines in the file
     """
-    hash2index = defaultdict(list)
-    index2lines = {}
-    # Comments, docstring and other specific patterns maybe excluded -> call to stripped_lines
-    # to get only what is desired
-    lines = tuple(x.text for x in lineset.stripped_lines)
-    # Need different iterators on same lines but each one is shifted 1 from the precedent
-    shifted_lines = [iter(lines[i:]) for i in range(min_common_lines)]
-
-    for i, *succ_lines in enumerate(zip(*shifted_lines)):
-        start_linenumber = LineNumber(lineset.stripped_lines[i].line_number)
-        try:
-            end_linenumber = lineset.stripped_lines[i + min_common_lines].line_number
-        except IndexError:
-            end_linenumber = LineNumber(lineset.stripped_lines[-1].line_number + 1)
+    hash_to_index: HashToIndex_T = defaultdict(list)
+    index_to_lines: IndexToLines_T = {}
+    stripped = lineset.stripped_lines
+    num_lines = len(stripped)
+    if num_lines < min_common_lines:
+        return hash_to_index, index_to_lines
+
+    # Pre-compute per-line hashes for the rolling window
+    line_hashes = [hash(spec.text) for spec in stripped]
+
+    # Seed the rolling hash with the first window
+    window_hash = sum(line_hashes[:min_common_lines])
+
+    last_index = num_lines - 1
+    for i in range(num_lines - min_common_lines + 1):
+        start_linenumber = LineNumber(stripped[i].line_number)
+        window_end = i + min_common_lines
+        end_linenumber = (
+            stripped[window_end].line_number
+            if window_end <= last_index
+            else LineNumber(stripped[last_index].line_number + 1)
+        )
         index = Index(i)
-        index2lines[index] = SuccessiveLinesLimits(
+        index_to_lines[index] = SuccessiveLinesLimits(
             start=start_linenumber, end=end_linenumber
         )
-        l_c = LinesChunk(lineset.name, index, *succ_lines)
-        hash2index[l_c].append(index)
+        hash_to_index[window_hash].append(index)
+
+        # Slide the window: subtract the leaving line, add the entering line
+        if window_end <= last_index:
+            window_hash = window_hash - line_hashes[i] + line_hashes[window_end]
 
-    return hash2index, index2lines
+    return hash_to_index, index_to_lines
 
 
 def remove_successive(all_couples: CplIndexToCplLines_T) -> None:
@@ -465,7 +436,11 @@ def _get_similarity_report(
 
     # pylint: disable = too-many-locals
     def _find_common(
-        self, lineset1: LineSet, lineset2: LineSet
+        self,
+        lineset1: LineSet,
+        lineset2: LineSet,
+        hashes1: tuple[HashToIndex_T, IndexToLines_T] | None = None,
+        hashes2: tuple[HashToIndex_T, IndexToLines_T] | None = None,
     ) -> Generator[Commonality]:
         """Find similarities in the two given linesets.
@@ -483,27 +458,28 @@ def _find_common( hash_to_index_2: HashToIndex_T index_to_lines_1: IndexToLines_T index_to_lines_2: IndexToLines_T - hash_to_index_1, index_to_lines_1 = hash_lineset( - lineset1, self.namespace.min_similarity_lines - ) - hash_to_index_2, index_to_lines_2 = hash_lineset( - lineset2, self.namespace.min_similarity_lines - ) - - hash_1: frozenset[LinesChunk] = frozenset(hash_to_index_1.keys()) - hash_2: frozenset[LinesChunk] = frozenset(hash_to_index_2.keys()) + if hashes1 is not None: + hash_to_index_1, index_to_lines_1 = hashes1 + else: + hash_to_index_1, index_to_lines_1 = hash_lineset( + lineset1, self.namespace.min_similarity_lines + ) + if hashes2 is not None: + hash_to_index_2, index_to_lines_2 = hashes2 + else: + hash_to_index_2, index_to_lines_2 = hash_lineset( + lineset2, self.namespace.min_similarity_lines + ) - common_hashes: Iterable[LinesChunk] = sorted( - hash_1 & hash_2, key=lambda m: hash_to_index_1[m][0] - ) + common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys() # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of # successive common lines, to the corresponding starting and ending number lines in both files all_couples: CplIndexToCplLines_T = {} - for c_hash in sorted(common_hashes, key=operator.attrgetter("_index")): + for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0]): for indices_in_linesets in itertools.product( - hash_to_index_1[c_hash], hash_to_index_2[c_hash] + hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash] ): index_1 = indices_in_linesets[0] index_2 = indices_in_linesets[1] @@ -543,9 +519,21 @@ def _iter_sims(self) -> Generator[Commonality]: """Iterate on similarities among all files, by making a Cartesian product. """ + min_lines = self.namespace.min_similarity_lines + # Cache hash_lineset results: each lineset is compared against every + # other, so without caching it gets hashed (N-1) times. + cache: dict[int, tuple[HashToIndex_T, IndexToLines_T]] = {} for idx, lineset in enumerate(self.linesets[:-1]): for lineset2 in self.linesets[idx + 1 :]: - yield from self._find_common(lineset, lineset2) + lid1 = id(lineset) + if lid1 not in cache: + cache[lid1] = hash_lineset(lineset, min_lines) + lid2 = id(lineset2) + if lid2 not in cache: + cache[lid2] = hash_lineset(lineset2, min_lines) + yield from self._find_common( + lineset, lineset2, cache[lid1], cache[lid2] + ) def get_map_data(self) -> list[LineSet]: """Returns the data we can use for a map/reduce process. From 9b40232d2becd3f8bbad5a3c42f9fe3143f2a02e Mon Sep 17 00:00:00 2001 From: Pierre Sassoulas Date: Mon, 2 Mar 2026 15:35:27 +0100 Subject: [PATCH 2/7] Introduce LineSetHashResult and make hash params required in _find_common Replace the raw tuple[HashToIndex_T, IndexToLines_T] with a LineSetHashResult NamedTuple for clarity. Since _iter_sims always passes pre-computed hashes from its cache, make the parameters required and remove the unused fallback branches. 
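
For illustration, the call shape after this change looks like the
following (a minimal sketch using this patch's names, not a verbatim
excerpt of the diff below):

    hashes1 = hash_lineset(lineset1, min_lines)  # a LineSetHashResult
    hashes2 = hash_lineset(lineset2, min_lines)
    common = hashes1.hash_to_index.keys() & hashes2.hash_to_index.keys()

Accessing the fields by name replaces positional tuple unpacking at
each call site.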
--- pylint/checkers/symilar.py | 80 +++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py index 23ef700859..156da6fa21 100644 --- a/pylint/checkers/symilar.py +++ b/pylint/checkers/symilar.py @@ -77,6 +77,14 @@ class LineSpecifs(NamedTuple): # Links index in the lineset's stripped lines to the real lines in the file IndexToLines_T = dict[Index, "SuccessiveLinesLimits"] + +class LineSetHashResult(NamedTuple): + """Pre-computed hash data for a LineSet, used to speed up similarity lookup.""" + + hash_to_index: HashToIndex_T + index_to_lines: IndexToLines_T + + # The types the streams read by pylint can take. Originating from astroid.nodes.Module.stream() and open() STREAM_TYPES: TypeAlias = TextIO | BufferedReader | BytesIO @@ -166,25 +174,25 @@ def increment(self, value: Index) -> LineSetStartCouple: def hash_lineset( lineset: LineSet, min_common_lines: int = DEFAULT_MIN_SIMILARITY_LINE -) -> tuple[HashToIndex_T, IndexToLines_T]: - """Return two dicts. +) -> LineSetHashResult: + """Return pre-computed hash data for a lineset. - The first associates the hash of successive stripped lines of a lineset - to the indices of the starting lines. - The second dict, associates the index of the starting line in the lineset's stripped lines to the - couple [start, end] lines number in the corresponding file. + The result contains two dicts: + - hash_to_index: maps the hash of successive stripped lines to the starting + indices (in lineset's stripped lines) of the chunks that produced that hash. + - index_to_lines: maps the index of the starting line in the lineset's stripped + lines to the start and end line numbers in the corresponding file. :param lineset: lineset object (i.e the lines in a file) :param min_common_lines: number of successive lines that are used to compute the hash - :return: a dict linking hashes to corresponding start index and a dict that links this - index to the start and end lines in the file + :return: a LineSetHashResult with hash-to-index and index-to-lines mappings """ hash_to_index: HashToIndex_T = defaultdict(list) index_to_lines: IndexToLines_T = {} stripped = lineset.stripped_lines num_lines = len(stripped) if num_lines < min_common_lines: - return hash_to_index, index_to_lines + return LineSetHashResult(hash_to_index, index_to_lines) # Pre-compute per-line hashes for the rolling window line_hashes = [hash(spec.text) for spec in stripped] @@ -213,7 +221,7 @@ def hash_lineset( if window_end <= last_index: window_hash = window_hash - line_hashes[i] + line_hashes[window_end] - return hash_to_index, index_to_lines + return LineSetHashResult(hash_to_index, index_to_lines) def remove_successive(all_couples: CplIndexToCplLines_T) -> None: @@ -434,13 +442,12 @@ def _get_similarity_report( ) return report - # pylint: disable = too-many-locals def _find_common( self, lineset1: LineSet, lineset2: LineSet, - hashes1: tuple[HashToIndex_T, IndexToLines_T] | None = None, - hashes2: tuple[HashToIndex_T, IndexToLines_T] | None = None, + hashes1: LineSetHashResult, + hashes2: LineSetHashResult, ) -> Generator[Commonality]: """Find similarities in the two given linesets. @@ -454,39 +461,24 @@ def _find_common( account common chunk of lines that have more than the minimal number of successive lines required. 
""" - hash_to_index_1: HashToIndex_T - hash_to_index_2: HashToIndex_T - index_to_lines_1: IndexToLines_T - index_to_lines_2: IndexToLines_T - if hashes1 is not None: - hash_to_index_1, index_to_lines_1 = hashes1 - else: - hash_to_index_1, index_to_lines_1 = hash_lineset( - lineset1, self.namespace.min_similarity_lines - ) - if hashes2 is not None: - hash_to_index_2, index_to_lines_2 = hashes2 - else: - hash_to_index_2, index_to_lines_2 = hash_lineset( - lineset2, self.namespace.min_similarity_lines - ) - - common_hashes = hash_to_index_1.keys() & hash_to_index_2.keys() + common_hashes = hashes1.hash_to_index.keys() & hashes2.hash_to_index.keys() # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of # successive common lines, to the corresponding starting and ending number lines in both files all_couples: CplIndexToCplLines_T = {} - for chunk_hash in sorted(common_hashes, key=lambda h: hash_to_index_1[h][0]): + for chunk_hash in sorted( + common_hashes, key=lambda h: hashes1.hash_to_index[h][0] + ): for indices_in_linesets in itertools.product( - hash_to_index_1[chunk_hash], hash_to_index_2[chunk_hash] + hashes1.hash_to_index[chunk_hash], hashes2.hash_to_index[chunk_hash] ): index_1 = indices_in_linesets[0] index_2 = indices_in_linesets[1] all_couples[LineSetStartCouple(index_1, index_2)] = ( CplSuccessiveLinesLimits( - copy.copy(index_to_lines_1[index_1]), - copy.copy(index_to_lines_2[index_2]), + copy.copy(hashes1.index_to_lines[index_1]), + copy.copy(hashes2.index_to_lines[index_2]), effective_cmn_lines_nb=self.namespace.min_similarity_lines, ) ) @@ -522,17 +514,17 @@ def _iter_sims(self) -> Generator[Commonality]: min_lines = self.namespace.min_similarity_lines # Cache hash_lineset results: each lineset is compared against every # other, so without caching it gets hashed (N-1) times. - cache: dict[int, tuple[HashToIndex_T, IndexToLines_T]] = {} - for idx, lineset in enumerate(self.linesets[:-1]): + cache: dict[int, LineSetHashResult] = {} + for idx, lineset1 in enumerate(self.linesets[:-1]): for lineset2 in self.linesets[idx + 1 :]: - lid1 = id(lineset) - if lid1 not in cache: - cache[lid1] = hash_lineset(lineset, min_lines) - lid2 = id(lineset2) - if lid2 not in cache: - cache[lid2] = hash_lineset(lineset2, min_lines) + key1 = id(lineset1) + if key1 not in cache: + cache[key1] = hash_lineset(lineset1, min_lines) + key2 = id(lineset2) + if key2 not in cache: + cache[key2] = hash_lineset(lineset2, min_lines) yield from self._find_common( - lineset, lineset2, cache[lid1], cache[lid2] + lineset1, lineset2, cache[key1], cache[key2] ) def get_map_data(self) -> list[LineSet]: From 8027e6cc194f7ac9d6df892a0f89eabf227fd257 Mon Sep 17 00:00:00 2001 From: Pierre Sassoulas Date: Tue, 3 Mar 2026 09:44:44 +0100 Subject: [PATCH 3/7] Reuse parsed AST in stripped_lines to avoid redundant astroid.parse() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit process_module already receives the parsed nodes.Module from pylint's main pass, but stripped_lines was re-parsing every file from source text. Thread the existing AST through process_module → append_stream → LineSet → stripped_lines, falling back to astroid.parse() only when no tree is provided (standalone symilar CLI). The redundant parse dominated stripped_lines cost. 
Per-file savings: file size time saved memory saved 0 lines 0.14 ms 0.03 MB 924 lines 65 ms 2.1 MB 31k lines 2764 ms 101.6 MB End-to-end on pylint's own codebase (179 files, ~49k SLOC): before median=6.6s peak RSS=170 MB after median=5.1s peak RSS=149 MB (1.5x faster, -12% memory) --- pylint/checkers/symilar.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py index 156da6fa21..e87fa07497 100644 --- a/pylint/checkers/symilar.py +++ b/pylint/checkers/symilar.py @@ -336,7 +336,11 @@ def __init__( self.linesets: list[LineSet] = [] def append_stream( - self, streamid: str, stream: STREAM_TYPES, encoding: str | None = None + self, + streamid: str, + stream: STREAM_TYPES, + encoding: str | None = None, + tree: nodes.Module | None = None, ) -> None: """Append a file to search for similarities.""" if isinstance(stream, BufferedIOBase): @@ -365,6 +369,7 @@ def append_stream( if hasattr(self, "linter") else None ), + tree=tree, ) ) @@ -550,6 +555,7 @@ def stripped_lines( ignore_imports: bool, ignore_signatures: bool, line_enabled_callback: Callable[[str, int], bool] | None = None, + tree: nodes.Module | None = None, ) -> list[LineSpecifs]: """Return tuples of line/line number/line type with leading/trailing white-space and any ignored code features removed. @@ -561,11 +567,13 @@ def stripped_lines( :param ignore_signatures: if true, any line that is part of a function signature is removed from the result :param line_enabled_callback: If called with "R0801" and a line number, a return value of False will disregard the line + :param tree: pre-parsed AST; when provided the redundant astroid.parse() call is skipped :return: the collection of line/line number/line type tuples """ ignore_lines: set[int] = set() if ignore_imports or ignore_signatures: - tree = astroid.parse("".join(lines)) + if tree is None: + tree = astroid.parse("".join(lines)) if ignore_imports: ignore_lines.update( chain.from_iterable( @@ -654,6 +662,7 @@ def __init__( ignore_imports: bool = False, ignore_signatures: bool = False, line_enabled_callback: Callable[[str, int], bool] | None = None, + tree: nodes.Module | None = None, ) -> None: self.name = name self._real_lines = lines @@ -664,6 +673,7 @@ def __init__( ignore_imports, ignore_signatures, line_enabled_callback=line_enabled_callback, + tree=tree, ) def __str__(self) -> str: @@ -816,7 +826,9 @@ def process_module(self, node: nodes.Module) -> None: stacklevel=2, ) with node.stream() as stream: - self.append_stream(self.linter.current_name, stream, node.file_encoding) + self.append_stream( + self.linter.current_name, stream, node.file_encoding, tree=node + ) def close(self) -> None: """Compute and display similarities on closing (i.e. end of parsing).""" From dae5b833ceb0fb24a04f56f84a77521d9af8da86 Mon Sep 17 00:00:00 2001 From: Pierre Sassoulas Date: Tue, 3 Mar 2026 10:15:24 +0100 Subject: [PATCH 4/7] Cap hash bucket Cartesian product to prevent quadratic blow-up When two files share a hash bucket with many indices (e.g. thousands of identical data lines), itertools.product explodes quadratically (4k x 22k = 88M pairs). Fall back to aligned zip pairing when the product exceeds 500 entries. The diagonal pairs are consecutive so remove_successive still coalesces them into one correct block. Benchmark on psf/black (316 files, 129k lines including pathological profiling/ data files with 22k+ identical lines): hung forever -> 7.2s. 
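
A toy illustration of the size argument (standalone sketch, not code
from this patch):

    indices_1 = list(range(4_000))   # bucket indices from file 1
    indices_2 = list(range(22_000))  # bucket indices from file 2
    # The full Cartesian product visits 4_000 * 22_000 pairs.
    assert len(indices_1) * len(indices_2) == 88_000_000
    # The aligned fallback visits only min(N, M) pairs.
    assert len(list(zip(indices_1, indices_2))) == 4_000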
---
 doc/whatsnew/4/4.1/index.rst             |  5 +++
 doc/whatsnew/fragments/10881.performance | 15 +++++--
 pylint/checkers/symilar.py               | 51 +++++++++++++++---------
 3 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/doc/whatsnew/4/4.1/index.rst b/doc/whatsnew/4/4.1/index.rst
index 9fb075b671..b6c65db818 100644
--- a/doc/whatsnew/4/4.1/index.rst
+++ b/doc/whatsnew/4/4.1/index.rst
@@ -12,6 +12,11 @@ Summary -- Release highlights
 =============================
 
+The duplicate-code checker and ``symilar`` were optimized, yielding
+considerable speedups and lower memory use on larger codebases. For
+example, analyzing pandas went from 20 min to 55 s, and pylint no
+longer gets OOM-killed when analyzing CPython.
+
 The required ``astroid`` version is now 4.1.1. See the
 `astroid changelog
 <https://github.com/pylint-dev/astroid/blob/main/ChangeLog>`_
 for additional fixes, features, and performance improvements applicable
 to pylint.
diff --git a/doc/whatsnew/fragments/10881.performance b/doc/whatsnew/fragments/10881.performance
index a8bfa495e6..f9ced17eaf 100644
--- a/doc/whatsnew/fragments/10881.performance
+++ b/doc/whatsnew/fragments/10881.performance
@@ -1,5 +1,14 @@
-Speed up the ``duplicate-code`` checker by using C-level hashing, a rolling hash
-window, and caching results across file pairs. Expect pylint to be ~25% faster on
-~25k SLOC (astroid) and ~70% faster on ~130k SLOC (django) when duplicate-code is enabled.
+Sped up the ``duplicate-code`` checker. When run inside pylint the
+checker now reuses the already-parsed AST instead of re-parsing every
+file, as it still must when launched via the standalone ``symilar``
+command, and it uses a rolling hash window with caching across file
+pairs. The hash-matching phase also switches algorithms above a
+threshold, avoiding a quadratic blow-up that previously caused the
+checker to hang on files with many repeated lines.
+
+The speedup scales with codebase size, from 1.5x on small projects
+(~10k lines) to 20x on large ones (500k+ lines). Memory usage also
+drops by 12-27%. Codebases that previously hung or were OOM-killed
+can now complete.
 
 Refs #10881
diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py
index e87fa07497..e986ab1fd5 100644
--- a/pylint/checkers/symilar.py
+++ b/pylint/checkers/symilar.py
@@ -57,6 +57,12 @@
 
 REGEX_FOR_LINES_WITH_CONTENT = re.compile(r".*\w+")
 
+# When two files share a hash bucket whose Cartesian product exceeds this
+# limit, fall back to aligned (zip) pairing instead of the full product.
+# This prevents quadratic blow-up on files with many identical lines (e.g.
+# auto-generated data). The result is a correct lower bound on duplicates.
+_HASH_BUCKET_PRODUCT_LIMIT: int = 500
+
 # Index defines a location in a LineSet stripped lines collection
 Index = NewType("Index", int)
 
@@ -475,11 +481,19 @@ def _find_common(
         for chunk_hash in sorted(
             common_hashes, key=lambda h: hashes1.hash_to_index[h][0]
         ):
-            for indices_in_linesets in itertools.product(
-                hashes1.hash_to_index[chunk_hash], hashes2.hash_to_index[chunk_hash]
-            ):
-                index_1 = indices_in_linesets[0]
-                index_2 = indices_in_linesets[1]
+            indices_1 = hashes1.hash_to_index[chunk_hash]
+            indices_2 = hashes2.hash_to_index[chunk_hash]
+
+            # When both buckets are large the Cartesian product becomes
+            # quadratic (e.g. 4000 x 22000 = 88M pairs for repeated data
+            # lines). Fall back to aligned pairing which is O(min(N, M))
+            # and still lets remove_successive coalesce consecutive matches.
+            if len(indices_1) * len(indices_2) > _HASH_BUCKET_PRODUCT_LIMIT:
+                pairs: Iterable[tuple[Index, Index]] = zip(indices_1, indices_2)
+            else:
+                pairs = itertools.product(indices_1, indices_2)
+
+            for index_1, index_2 in pairs:
                 all_couples[LineSetStartCouple(index_1, index_2)] = (
                     CplSuccessiveLinesLimits(
                         copy.copy(hashes1.index_to_lines[index_1]),
@@ -493,24 +507,25 @@ def _find_common(
         for cml_stripped_l, cmn_l in all_couples.items():
             start_index_1 = cml_stripped_l.fst_lineset_index
             start_index_2 = cml_stripped_l.snd_lineset_index
-            nb_common_lines = cmn_l.effective_cmn_lines_nb
-
-            com = Commonality(
-                cmn_lines_nb=nb_common_lines,
-                fst_lset=lineset1,
-                fst_file_start=cmn_l.first_file.start,
-                fst_file_end=cmn_l.first_file.end,
-                snd_lset=lineset2,
-                snd_file_start=cmn_l.second_file.start,
-                snd_file_end=cmn_l.second_file.end,
-            )
 
             eff_cmn_nb = filter_noncode_lines(
-                lineset1, start_index_1, lineset2, start_index_2, nb_common_lines
+                lineset1,
+                start_index_1,
+                lineset2,
+                start_index_2,
+                cmn_l.effective_cmn_lines_nb,
             )
 
             if eff_cmn_nb > self.namespace.min_similarity_lines:
-                yield com
+                yield Commonality(
+                    cmn_lines_nb=cmn_l.effective_cmn_lines_nb,
+                    fst_lset=lineset1,
+                    fst_file_start=cmn_l.first_file.start,
+                    fst_file_end=cmn_l.first_file.end,
+                    snd_lset=lineset2,
+                    snd_file_start=cmn_l.second_file.start,
+                    snd_file_end=cmn_l.second_file.end,
+                )
 
     def _iter_sims(self) -> Generator[Commonality]:
         """Iterate on similarities among all files, by making a Cartesian

From bc95dfc6aa987391a0521f9fb95454245d4fe78c Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Tue, 3 Mar 2026 16:29:06 +0100
Subject: [PATCH 5/7] Re-enable duplicate-code checker in stdlib primer test

The duplicate-code checker was previously disabled in the stdlib primer
because it was too slow. With the recent performance optimizations it
should now complete in a reasonable time, so re-enable it.
---
 tests/primer/test_primer_stdlib.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/primer/test_primer_stdlib.py b/tests/primer/test_primer_stdlib.py
index cd64c212b1..456a7c56a9 100644
--- a/tests/primer/test_primer_stdlib.py
+++ b/tests/primer/test_primer_stdlib.py
@@ -59,9 +59,8 @@ def test_primer_stdlib_no_crash(
     try:
         # We want to test all the code we can
         enables = ["--enable-all-extensions", "--enable=all"]
-        # Duplicate code takes too long and is relatively safe
         # We don't want to lint the test directory which are repetitive
-        disables = ["--disable=duplicate-code", "--ignore=test"]
+        disables = ["--ignore=test"]
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=UserWarning)
             Run([test_module_name, *enables, *disables])

From a8bef0e8244808a032e6a8c74d783d6308d5690e Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Fri, 24 Apr 2026 16:19:04 +0200
Subject: [PATCH 6/7] Cover aligned-zip fallback in symilar's _find_common

Adds a functional test that patches `_HASH_BUCKET_PRODUCT_LIMIT` to
zero and runs symilar over repeated-block content so the aligned-zip
fallback path is always exercised. Covers the previously-untested
branch flagged in the codecov report on #10881.

Addresses Jacob's review request for test coverage on the "other form
of the algorithm" introduced to cap quadratic behavior.
---
 tests/checkers/unittest_symilar.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/checkers/unittest_symilar.py b/tests/checkers/unittest_symilar.py
index d51b070d3d..bb47694174 100644
--- a/tests/checkers/unittest_symilar.py
+++ b/tests/checkers/unittest_symilar.py
@@ -470,3 +470,33 @@ def test_bad_short_form_option(capsys: CaptureFixture) -> None:
     assert ex.value.code == 2
     assert not out
     assert "unrecognized arguments: -j=0" in err
+
+
+def test_hash_bucket_product_limit_fallback(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """When a hash bucket's Cartesian product exceeds
+    ``_HASH_BUCKET_PRODUCT_LIMIT``, ``_find_common`` falls back to aligned-zip
+    pairing. Mock the limit to zero so the fallback path is always taken over
+    a file with repeated blocks and verify duplicate detection still reports
+    the expected similar lines.
+
+    Regression test for https://github.com/pylint-dev/pylint/pull/10881.
+    """
+    monkeypatch.setattr(symilar, "_HASH_BUCKET_PRODUCT_LIMIT", 0)
+    # Three copies of the same 5-line block produce hash buckets with more
+    # than one index, exercising the aligned-zip fallback meaningfully.
+    block = "a = 1\nb = 2\nc = 3\nd = 4\ne = 5\n"
+    file_a = tmp_path / "a.py"
+    file_b = tmp_path / "b.py"
+    file_a.write_text(block * 3)
+    file_b.write_text(block * 3)
+
+    output = StringIO()
+    with redirect_stdout(output), pytest.raises(SystemExit) as ex:
+        symilar.Run([str(file_a), str(file_b)])
+    assert ex.value.code == 0
+    out = output.getvalue()
+    assert "15 similar lines in 2 files" in out
+    assert f"=={file_a}:[0:15]" in out
+    assert f"=={file_b}:[0:15]" in out

From 1e003edf2fb70882682e6e028e04ac3ce5cd750a Mon Sep 17 00:00:00 2001
From: Pierre Sassoulas
Date: Fri, 24 Apr 2026 16:28:53 +0200
Subject: [PATCH 7/7] Cover previously-untested dunder methods and edge cases
 in symilar

Adds focused tests for `LineSetStartCouple.__eq__` NotImplemented
branch, `LineSet.__str__` / `__getitem__` / non-LineSet `__eq__`,
`append_stream` binary-without-encoding and UnicodeDecodeError paths,
`report_similarities` table building, and the `process_module`
deprecation warning when `linter.current_name` is None. Takes
symilar.py from 96% to 99% coverage. The remaining gaps are an
unreachable defensive `except KeyError` in `remove_successive` and the
`if __name__ == "__main__"` guard.
---
 tests/checkers/unittest_symilar.py | 76 +++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/tests/checkers/unittest_symilar.py b/tests/checkers/unittest_symilar.py
index bb47694174..ec13a6e91b 100644
--- a/tests/checkers/unittest_symilar.py
+++ b/tests/checkers/unittest_symilar.py
@@ -3,15 +3,19 @@
 # Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt
 
 from contextlib import redirect_stdout
-from io import StringIO
+from io import BytesIO, StringIO
 from pathlib import Path
 
+import astroid
 import pytest
 from _pytest.capture import CaptureFixture
 
 from pylint.checkers import symilar
+from pylint.checkers.symilar import LineSet, LineSetStartCouple, Symilar
 from pylint.lint import PyLinter
+from pylint.reporters.ureports.nodes import Section, Table
 from pylint.testutils import GenericTestReporter as Reporter
+from pylint.utils import LinterStats
 
 INPUT = Path(__file__).parent / ".." / "input"
/ "input" SIMILAR1 = str(INPUT / "similar1") @@ -472,6 +476,76 @@ def test_bad_short_form_option(capsys: CaptureFixture) -> None: assert "unrecognized arguments: -j=0" in err +def test_line_set_start_couple_eq_non_couple() -> None: + """``LineSetStartCouple.__eq__`` returns ``NotImplemented`` against an + object that isn't a ``LineSetStartCouple`` so Python falls back to + reflected comparison. + """ + couple = LineSetStartCouple(symilar.Index(0), symilar.Index(1)) + assert couple != object() + # pylint: disable-next=unnecessary-dunder-call + assert couple.__eq__(object()) is NotImplemented + + +def test_line_set_dunder_methods() -> None: + """Cover LineSet ``__str__``, ``__getitem__`` and non-LineSet ``__eq__``.""" + lines = ["a = 1\n", "b = 2\n", "c = 3\n"] + lineset = LineSet("fake.py", lines) + assert str(lineset) == "" + assert lineset[0].text == "a = 1" + assert (lineset == "not a lineset") is False + + +def test_append_stream_binary_requires_encoding() -> None: + """``append_stream`` raises ValueError when a binary stream is passed + without an encoding. + """ + runner = Symilar() + with pytest.raises(ValueError): + runner.append_stream("bin.py", BytesIO(b"a = 1\n")) + + +def test_report_similarities_builds_table() -> None: + """``report_similarities`` appends a stats table to the given section.""" + stats = LinterStats() + stats.reset_duplicated_lines() + section = Section() + symilar.report_similarities(section, stats, None) + assert len(section.children) == 1 + assert isinstance(section.children[0], Table) + + +def test_process_module_warns_when_current_name_is_none(tmp_path: Path) -> None: + """``SimilaritiesChecker.process_module`` warns when + ``linter.current_name`` is None (the deprecated state). + """ + linter = PyLinter(reporter=Reporter()) + linter.register_checker(symilar.SimilaritiesChecker(linter)) + checker = symilar.SimilaritiesChecker(linter) + linter.current_name = None # type: ignore[assignment] + module_file = tmp_path / "m.py" + module_file.write_text("a = 1\n") + + module = astroid.parse(module_file.read_text(), module_name="m") + module.file = str(module_file) + module.file_bytes = module_file.read_bytes() + with pytest.warns(DeprecationWarning, match="current_name attribute"): + checker.process_module(module) + + +def test_append_stream_unicode_error_yields_empty_lineset(tmp_path: Path) -> None: + """``append_stream`` swallows ``UnicodeDecodeError`` and treats the file + as empty rather than crashing. + """ + bad_file = tmp_path / "bad.py" + bad_file.write_bytes(b"\xff\xfe\xfa not valid utf-8\n") + runner = Symilar() + with bad_file.open("rb") as stream: + runner.append_stream(str(bad_file), stream, encoding="utf-8") + assert len(runner.linesets) == 1 + assert runner.linesets[0].stripped_lines == [] + + def test_hash_bucket_product_limit_fallback( monkeypatch: pytest.MonkeyPatch, tmp_path: Path ) -> None: