diff --git a/doc/whatsnew/4/4.1/index.rst b/doc/whatsnew/4/4.1/index.rst index 9fb075b671..b6c65db818 100644 --- a/doc/whatsnew/4/4.1/index.rst +++ b/doc/whatsnew/4/4.1/index.rst @@ -12,6 +12,11 @@ Summary -- Release highlights ============================= +The duplicate-code checker and ``symilar`` received optimizations that +result in considerable performance improvements and memory use reduction +on larger codebases. For example, pandas analysis went from 20 min to +55 s and pylint no longer gets OOM-killed when analyzing CPython. + +The required ``astroid`` version is now 4.1.1. See the `astroid changelog `_ for additional fixes, features, and performance improvements applicable to pylint. diff --git a/doc/whatsnew/fragments/10881.performance b/doc/whatsnew/fragments/10881.performance new file mode 100644 index 0000000000..f9ced17eaf --- /dev/null +++ b/doc/whatsnew/fragments/10881.performance @@ -0,0 +1,14 @@ +Sped up the ``duplicate-code`` checker. When run inside pylint the +checker now reuses the already-parsed AST instead of re-parsing every +file like it has to do when launched via ``symilar``, and it uses a +rolling hash window with caching across file pairs. Additionally, a +quadratic blow-up in the hash-matching phase is avoided by switching +algorithm at a threshold, which previously caused the checker to hang +on files with many repeated lines. + +Speedup scales with codebase size from 1.5x on small projects +(~10k lines), to 20x on large ones (500k+ lines). Memory usage also +drops 12-27%. Codebases that previously hung or were OOM-killed can +now complete. 
+ +Refs #10881 diff --git a/pylint/checkers/symilar.py b/pylint/checkers/symilar.py index 8d504f3364..e986ab1fd5 100644 --- a/pylint/checkers/symilar.py +++ b/pylint/checkers/symilar.py @@ -33,7 +33,6 @@ import copy import functools import itertools -import operator import re import sys import warnings @@ -58,6 +57,12 @@ REGEX_FOR_LINES_WITH_CONTENT = re.compile(r".*\w+") +# When two files share a hash bucket whose Cartesian product exceeds this +# limit, fall back to aligned (zip) pairing instead of the full product. +# This prevents quadratic blow-up on files with many identical lines (e.g. +# auto-generated data). The result is a correct lower bound on duplicates. +_HASH_BUCKET_PRODUCT_LIMIT: int = 500 + # Index defines a location in a LineSet stripped lines collection Index = NewType("Index", int) @@ -71,13 +76,21 @@ class LineSpecifs(NamedTuple): text: str -# Links LinesChunk object to the starting indices (in lineset's stripped lines) -# of the different chunk of lines that are used to compute the hash -HashToIndex_T = dict["LinesChunk", list[Index]] +# Maps the hash of successive stripped lines to the starting indices +# (in lineset's stripped lines) of the chunks that produced that hash. +HashToIndex_T = dict[int, list[Index]] # Links index in the lineset's stripped lines to the real lines in the file IndexToLines_T = dict[Index, "SuccessiveLinesLimits"] + +class LineSetHashResult(NamedTuple): + """Pre-computed hash data for a LineSet, used to speed up similarity lookup.""" + + hash_to_index: HashToIndex_T + index_to_lines: IndexToLines_T + + # The types the streams read by pylint can take. Originating from astroid.nodes.Module.stream() and open() STREAM_TYPES: TypeAlias = TextIO | BufferedReader | BytesIO @@ -105,45 +118,6 @@ def __init__( CplIndexToCplLines_T = dict["LineSetStartCouple", CplSuccessiveLinesLimits] -class LinesChunk: - """The LinesChunk object computes and stores the hash of some consecutive stripped - lines of a lineset. 
- """ - - __slots__ = ("_fileid", "_hash", "_index") - - def __init__(self, fileid: str, num_line: int, *lines: Iterable[str]) -> None: - self._fileid: str = fileid - """The name of the file from which the LinesChunk object is generated.""" - - self._index: Index = Index(num_line) - """The index in the stripped lines that is the starting of consecutive - lines. - """ - - self._hash: int = sum(hash(lin) for lin in lines) - """The hash of some consecutive lines.""" - - def __eq__(self, o: object) -> bool: - if not isinstance(o, LinesChunk): - return NotImplemented - return self._hash == o._hash - - def __hash__(self) -> int: - return self._hash - - def __repr__(self) -> str: - return ( - f"" - ) - - def __str__(self) -> str: - return ( - f"LinesChunk object for file {self._fileid}, starting at line {self._index} \n" - f"Hash is {self._hash}" - ) - - class SuccessiveLinesLimits: """A class to handle the numbering of begin and end of successive lines. @@ -206,43 +180,54 @@ def increment(self, value: Index) -> LineSetStartCouple: def hash_lineset( lineset: LineSet, min_common_lines: int = DEFAULT_MIN_SIMILARITY_LINE -) -> tuple[HashToIndex_T, IndexToLines_T]: - """Return two dicts. +) -> LineSetHashResult: + """Return pre-computed hash data for a lineset. - The first associates the hash of successive stripped lines of a lineset - to the indices of the starting lines. - The second dict, associates the index of the starting line in the lineset's stripped lines to the - couple [start, end] lines number in the corresponding file. + The result contains two dicts: + - hash_to_index: maps the hash of successive stripped lines to the starting + indices (in lineset's stripped lines) of the chunks that produced that hash. + - index_to_lines: maps the index of the starting line in the lineset's stripped + lines to the start and end line numbers in the corresponding file. 
:param lineset: lineset object (i.e the lines in a file) :param min_common_lines: number of successive lines that are used to compute the hash - :return: a dict linking hashes to corresponding start index and a dict that links this - index to the start and end lines in the file + :return: a LineSetHashResult with hash-to-index and index-to-lines mappings """ - hash2index = defaultdict(list) - index2lines = {} - # Comments, docstring and other specific patterns maybe excluded -> call to stripped_lines - # to get only what is desired - lines = tuple(x.text for x in lineset.stripped_lines) - # Need different iterators on same lines but each one is shifted 1 from the precedent - shifted_lines = [iter(lines[i:]) for i in range(min_common_lines)] - - for i, *succ_lines in enumerate(zip(*shifted_lines)): - start_linenumber = LineNumber(lineset.stripped_lines[i].line_number) - try: - end_linenumber = lineset.stripped_lines[i + min_common_lines].line_number - except IndexError: - end_linenumber = LineNumber(lineset.stripped_lines[-1].line_number + 1) + hash_to_index: HashToIndex_T = defaultdict(list) + index_to_lines: IndexToLines_T = {} + stripped = lineset.stripped_lines + num_lines = len(stripped) + if num_lines < min_common_lines: + return LineSetHashResult(hash_to_index, index_to_lines) + + # Pre-compute per-line hashes for the rolling window + line_hashes = [hash(spec.text) for spec in stripped] + + # Seed the rolling hash with the first window + window_hash = sum(line_hashes[:min_common_lines]) + + last_index = num_lines - 1 + for i in range(num_lines - min_common_lines + 1): + start_linenumber = LineNumber(stripped[i].line_number) + window_end = i + min_common_lines + end_linenumber = ( + stripped[window_end].line_number + if window_end <= last_index + else LineNumber(stripped[last_index].line_number + 1) + ) index = Index(i) - index2lines[index] = SuccessiveLinesLimits( + index_to_lines[index] = SuccessiveLinesLimits( start=start_linenumber, end=end_linenumber ) - 
l_c = LinesChunk(lineset.name, index, *succ_lines) - hash2index[l_c].append(index) + hash_to_index[window_hash].append(index) - return hash2index, index2lines + # Slide the window: subtract the leaving line, add the entering line + if window_end <= last_index: + window_hash = window_hash - line_hashes[i] + line_hashes[window_end] + + return LineSetHashResult(hash_to_index, index_to_lines) def remove_successive(all_couples: CplIndexToCplLines_T) -> None: @@ -357,7 +342,11 @@ def __init__( self.linesets: list[LineSet] = [] def append_stream( - self, streamid: str, stream: STREAM_TYPES, encoding: str | None = None + self, + streamid: str, + stream: STREAM_TYPES, + encoding: str | None = None, + tree: nodes.Module | None = None, ) -> None: """Append a file to search for similarities.""" if isinstance(stream, BufferedIOBase): @@ -386,6 +375,7 @@ def append_stream( if hasattr(self, "linter") else None ), + tree=tree, ) ) @@ -463,9 +453,12 @@ def _get_similarity_report( ) return report - # pylint: disable = too-many-locals def _find_common( - self, lineset1: LineSet, lineset2: LineSet + self, + lineset1: LineSet, + lineset2: LineSet, + hashes1: LineSetHashResult, + hashes2: LineSetHashResult, ) -> Generator[Commonality]: """Find similarities in the two given linesets. @@ -479,38 +472,32 @@ def _find_common( account common chunk of lines that have more than the minimal number of successive lines required. 
""" - hash_to_index_1: HashToIndex_T - hash_to_index_2: HashToIndex_T - index_to_lines_1: IndexToLines_T - index_to_lines_2: IndexToLines_T - hash_to_index_1, index_to_lines_1 = hash_lineset( - lineset1, self.namespace.min_similarity_lines - ) - hash_to_index_2, index_to_lines_2 = hash_lineset( - lineset2, self.namespace.min_similarity_lines - ) - - hash_1: frozenset[LinesChunk] = frozenset(hash_to_index_1.keys()) - hash_2: frozenset[LinesChunk] = frozenset(hash_to_index_2.keys()) - - common_hashes: Iterable[LinesChunk] = sorted( - hash_1 & hash_2, key=lambda m: hash_to_index_1[m][0] - ) + common_hashes = hashes1.hash_to_index.keys() & hashes2.hash_to_index.keys() # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of # successive common lines, to the corresponding starting and ending number lines in both files all_couples: CplIndexToCplLines_T = {} - for c_hash in sorted(common_hashes, key=operator.attrgetter("_index")): - for indices_in_linesets in itertools.product( - hash_to_index_1[c_hash], hash_to_index_2[c_hash] - ): - index_1 = indices_in_linesets[0] - index_2 = indices_in_linesets[1] + for chunk_hash in sorted( + common_hashes, key=lambda h: hashes1.hash_to_index[h][0] + ): + indices_1 = hashes1.hash_to_index[chunk_hash] + indices_2 = hashes2.hash_to_index[chunk_hash] + + # When both buckets are large the Cartesian product becomes + # quadratic (e.g. 4000 x 22000 = 88M pairs for repeated data + # lines). Fall back to aligned pairing which is O(min(N, M)) + # and still lets remove_successive coalesce consecutive matches. 
+ if len(indices_1) * len(indices_2) > _HASH_BUCKET_PRODUCT_LIMIT: + pairs: Iterable[tuple[Index, Index]] = zip(indices_1, indices_2) + else: + pairs = itertools.product(indices_1, indices_2) + + for index_1, index_2 in pairs: all_couples[LineSetStartCouple(index_1, index_2)] = ( CplSuccessiveLinesLimits( - copy.copy(index_to_lines_1[index_1]), - copy.copy(index_to_lines_2[index_2]), + copy.copy(hashes1.index_to_lines[index_1]), + copy.copy(hashes2.index_to_lines[index_2]), effective_cmn_lines_nb=self.namespace.min_similarity_lines, ) ) @@ -520,32 +507,45 @@ def _find_common( for cml_stripped_l, cmn_l in all_couples.items(): start_index_1 = cml_stripped_l.fst_lineset_index start_index_2 = cml_stripped_l.snd_lineset_index - nb_common_lines = cmn_l.effective_cmn_lines_nb - - com = Commonality( - cmn_lines_nb=nb_common_lines, - fst_lset=lineset1, - fst_file_start=cmn_l.first_file.start, - fst_file_end=cmn_l.first_file.end, - snd_lset=lineset2, - snd_file_start=cmn_l.second_file.start, - snd_file_end=cmn_l.second_file.end, - ) eff_cmn_nb = filter_noncode_lines( - lineset1, start_index_1, lineset2, start_index_2, nb_common_lines + lineset1, + start_index_1, + lineset2, + start_index_2, + cmn_l.effective_cmn_lines_nb, ) if eff_cmn_nb > self.namespace.min_similarity_lines: - yield com + yield Commonality( + cmn_lines_nb=cmn_l.effective_cmn_lines_nb, + fst_lset=lineset1, + fst_file_start=cmn_l.first_file.start, + fst_file_end=cmn_l.first_file.end, + snd_lset=lineset2, + snd_file_start=cmn_l.second_file.start, + snd_file_end=cmn_l.second_file.end, + ) def _iter_sims(self) -> Generator[Commonality]: """Iterate on similarities among all files, by making a Cartesian product. """ - for idx, lineset in enumerate(self.linesets[:-1]): + min_lines = self.namespace.min_similarity_lines + # Cache hash_lineset results: each lineset is compared against every + # other, so without caching it gets hashed (N-1) times. 
+ cache: dict[int, LineSetHashResult] = {} + for idx, lineset1 in enumerate(self.linesets[:-1]): for lineset2 in self.linesets[idx + 1 :]: - yield from self._find_common(lineset, lineset2) + key1 = id(lineset1) + if key1 not in cache: + cache[key1] = hash_lineset(lineset1, min_lines) + key2 = id(lineset2) + if key2 not in cache: + cache[key2] = hash_lineset(lineset2, min_lines) + yield from self._find_common( + lineset1, lineset2, cache[key1], cache[key2] + ) def get_map_data(self) -> list[LineSet]: """Returns the data we can use for a map/reduce process. @@ -570,6 +570,7 @@ def stripped_lines( ignore_imports: bool, ignore_signatures: bool, line_enabled_callback: Callable[[str, int], bool] | None = None, + tree: nodes.Module | None = None, ) -> list[LineSpecifs]: """Return tuples of line/line number/line type with leading/trailing white-space and any ignored code features removed. @@ -581,11 +582,13 @@ def stripped_lines( :param ignore_signatures: if true, any line that is part of a function signature is removed from the result :param line_enabled_callback: If called with "R0801" and a line number, a return value of False will disregard the line + :param tree: pre-parsed AST; when provided the redundant astroid.parse() call is skipped :return: the collection of line/line number/line type tuples """ ignore_lines: set[int] = set() if ignore_imports or ignore_signatures: - tree = astroid.parse("".join(lines)) + if tree is None: + tree = astroid.parse("".join(lines)) if ignore_imports: ignore_lines.update( chain.from_iterable( @@ -674,6 +677,7 @@ def __init__( ignore_imports: bool = False, ignore_signatures: bool = False, line_enabled_callback: Callable[[str, int], bool] | None = None, + tree: nodes.Module | None = None, ) -> None: self.name = name self._real_lines = lines @@ -684,6 +688,7 @@ def __init__( ignore_imports, ignore_signatures, line_enabled_callback=line_enabled_callback, + tree=tree, ) def __str__(self) -> str: @@ -836,7 +841,9 @@ def process_module(self, 
node: nodes.Module) -> None: stacklevel=2, ) with node.stream() as stream: - self.append_stream(self.linter.current_name, stream, node.file_encoding) + self.append_stream( + self.linter.current_name, stream, node.file_encoding, tree=node + ) def close(self) -> None: """Compute and display similarities on closing (i.e. end of parsing).""" diff --git a/tests/checkers/unittest_symilar.py b/tests/checkers/unittest_symilar.py index d51b070d3d..ec13a6e91b 100644 --- a/tests/checkers/unittest_symilar.py +++ b/tests/checkers/unittest_symilar.py @@ -3,15 +3,19 @@ # Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt from contextlib import redirect_stdout -from io import StringIO +from io import BytesIO, StringIO from pathlib import Path +import astroid import pytest from _pytest.capture import CaptureFixture from pylint.checkers import symilar +from pylint.checkers.symilar import LineSet, LineSetStartCouple, Symilar from pylint.lint import PyLinter +from pylint.reporters.ureports.nodes import Section, Table from pylint.testutils import GenericTestReporter as Reporter +from pylint.utils import LinterStats INPUT = Path(__file__).parent / ".." / "input" SIMILAR1 = str(INPUT / "similar1") @@ -470,3 +474,103 @@ def test_bad_short_form_option(capsys: CaptureFixture) -> None: assert ex.value.code == 2 assert not out assert "unrecognized arguments: -j=0" in err + + +def test_line_set_start_couple_eq_non_couple() -> None: + """``LineSetStartCouple.__eq__`` returns ``NotImplemented`` against an + object that isn't a ``LineSetStartCouple`` so Python falls back to + reflected comparison. 
+ """ + couple = LineSetStartCouple(symilar.Index(0), symilar.Index(1)) + assert couple != object() + # pylint: disable-next=unnecessary-dunder-call + assert couple.__eq__(object()) is NotImplemented + + +def test_line_set_dunder_methods() -> None: + """Cover LineSet ``__str__``, ``__getitem__`` and non-LineSet ``__eq__``.""" + lines = ["a = 1\n", "b = 2\n", "c = 3\n"] + lineset = LineSet("fake.py", lines) + assert str(lineset) == "" + assert lineset[0].text == "a = 1" + assert (lineset == "not a lineset") is False + + +def test_append_stream_binary_requires_encoding() -> None: + """``append_stream`` raises ValueError when a binary stream is passed + without an encoding. + """ + runner = Symilar() + with pytest.raises(ValueError): + runner.append_stream("bin.py", BytesIO(b"a = 1\n")) + + +def test_report_similarities_builds_table() -> None: + """``report_similarities`` appends a stats table to the given section.""" + stats = LinterStats() + stats.reset_duplicated_lines() + section = Section() + symilar.report_similarities(section, stats, None) + assert len(section.children) == 1 + assert isinstance(section.children[0], Table) + + +def test_process_module_warns_when_current_name_is_none(tmp_path: Path) -> None: + """``SimilaritiesChecker.process_module`` warns when + ``linter.current_name`` is None (the deprecated state). 
+ """ + linter = PyLinter(reporter=Reporter()) + linter.register_checker(symilar.SimilaritiesChecker(linter)) + checker = symilar.SimilaritiesChecker(linter) + linter.current_name = None # type: ignore[assignment] + module_file = tmp_path / "m.py" + module_file.write_text("a = 1\n") + + module = astroid.parse(module_file.read_text(), module_name="m") + module.file = str(module_file) + module.file_bytes = module_file.read_bytes() + with pytest.warns(DeprecationWarning, match="current_name attribute"): + checker.process_module(module) + + +def test_append_stream_unicode_error_yields_empty_lineset(tmp_path: Path) -> None: + """``append_stream`` swallows ``UnicodeDecodeError`` and treats the file + as empty rather than crashing. + """ + bad_file = tmp_path / "bad.py" + bad_file.write_bytes(b"\xff\xfe\xfa not valid utf-8\n") + runner = Symilar() + with bad_file.open("rb") as stream: + runner.append_stream(str(bad_file), stream, encoding="utf-8") + assert len(runner.linesets) == 1 + assert runner.linesets[0].stripped_lines == [] + + +def test_hash_bucket_product_limit_fallback( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """When a hash bucket's Cartesian product exceeds + ``_HASH_BUCKET_PRODUCT_LIMIT``, ``_find_common`` falls back to aligned-zip + pairing. Mock the limit to zero so the fallback path is always taken over + a file with repeated blocks and verify duplicate detection still reports + the expected similar lines. + + Regression test for https://github.com/pylint-dev/pylint/pull/10881. + """ + monkeypatch.setattr(symilar, "_HASH_BUCKET_PRODUCT_LIMIT", 0) + # Three copies of the same 5-line block produce hash buckets with more + # than one index, exercising the aligned-zip fallback meaningfully. 
+ block = "a = 1\nb = 2\nc = 3\nd = 4\ne = 5\n" + file_a = tmp_path / "a.py" + file_b = tmp_path / "b.py" + file_a.write_text(block * 3) + file_b.write_text(block * 3) + + output = StringIO() + with redirect_stdout(output), pytest.raises(SystemExit) as ex: + symilar.Run([str(file_a), str(file_b)]) + assert ex.value.code == 0 + out = output.getvalue() + assert "15 similar lines in 2 files" in out + assert f"=={file_a}:[0:15]" in out + assert f"=={file_b}:[0:15]" in out diff --git a/tests/primer/test_primer_stdlib.py b/tests/primer/test_primer_stdlib.py index cd64c212b1..456a7c56a9 100644 --- a/tests/primer/test_primer_stdlib.py +++ b/tests/primer/test_primer_stdlib.py @@ -59,9 +59,8 @@ def test_primer_stdlib_no_crash( try: # We want to test all the code we can enables = ["--enable-all-extensions", "--enable=all"] - # Duplicate code takes too long and is relatively safe # We don't want to lint the test directory which are repetitive - disables = ["--disable=duplicate-code", "--ignore=test"] + disables = ["--ignore=test"] with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) Run([test_module_name, *enables, *disables])