@@ -77,6 +77,14 @@ class LineSpecifs(NamedTuple):
7777# Links index in the lineset's stripped lines to the real lines in the file
7878IndexToLines_T = dict [Index , "SuccessiveLinesLimits" ]
7979
class LineSetHashResult(NamedTuple):
    """Pre-computed hash data for a LineSet, used to speed up similarity lookup.

    Produced by ``hash_lineset`` and consumed when comparing two linesets,
    where the ``hash_to_index`` key sets of both results are intersected to
    locate candidate duplicated chunks.
    """

    # Maps the hash of a chunk of successive stripped lines to the indices
    # (in the lineset's stripped lines) of the lines starting such a chunk.
    hash_to_index: HashToIndex_T
    # Maps the index of a chunk's starting line in the lineset's stripped
    # lines to the [start, end] line numbers in the corresponding file.
    index_to_lines: IndexToLines_T
8088# The types the streams read by pylint can take. Originating from astroid.nodes.Module.stream() and open()
8189STREAM_TYPES : TypeAlias = TextIO | BufferedReader | BytesIO
8290
@@ -166,25 +174,25 @@ def increment(self, value: Index) -> LineSetStartCouple:
166174
167175def hash_lineset (
168176 lineset : LineSet , min_common_lines : int = DEFAULT_MIN_SIMILARITY_LINE
169- ) -> tuple [ HashToIndex_T , IndexToLines_T ] :
170- """Return two dicts .
177+ ) -> LineSetHashResult :
178+ """Return pre-computed hash data for a lineset .
171179
172- The first associates the hash of successive stripped lines of a lineset
173- to the indices of the starting lines.
174- The second dict, associates the index of the starting line in the lineset's stripped lines to the
175- couple [start, end] lines number in the corresponding file.
180+ The result contains two dicts:
181+ - hash_to_index: maps the hash of successive stripped lines to the starting
182+ indices (in lineset's stripped lines) of the chunks that produced that hash.
183+ - index_to_lines: maps the index of the starting line in the lineset's stripped
184+ lines to the start and end line numbers in the corresponding file.
176185
177186 :param lineset: lineset object (i.e the lines in a file)
178187 :param min_common_lines: number of successive lines that are used to compute the hash
179- :return: a dict linking hashes to corresponding start index and a dict that links this
180- index to the start and end lines in the file
188+ :return: a LineSetHashResult with hash-to-index and index-to-lines mappings
181189 """
182190 hash_to_index : HashToIndex_T = defaultdict (list )
183191 index_to_lines : IndexToLines_T = {}
184192 stripped = lineset .stripped_lines
185193 num_lines = len (stripped )
186194 if num_lines < min_common_lines :
187- return hash_to_index , index_to_lines
195+ return LineSetHashResult ( hash_to_index , index_to_lines )
188196
189197 # Pre-compute per-line hashes for the rolling window
190198 line_hashes = [hash (spec .text ) for spec in stripped ]
@@ -213,7 +221,7 @@ def hash_lineset(
213221 if window_end <= last_index :
214222 window_hash = window_hash - line_hashes [i ] + line_hashes [window_end ]
215223
216- return hash_to_index , index_to_lines
224+ return LineSetHashResult ( hash_to_index , index_to_lines )
217225
218226
219227def remove_successive (all_couples : CplIndexToCplLines_T ) -> None :
@@ -434,13 +442,12 @@ def _get_similarity_report(
434442 )
435443 return report
436444
437- # pylint: disable = too-many-locals
438445 def _find_common (
439446 self ,
440447 lineset1 : LineSet ,
441448 lineset2 : LineSet ,
442- hashes1 : tuple [ HashToIndex_T , IndexToLines_T ] | None = None ,
443- hashes2 : tuple [ HashToIndex_T , IndexToLines_T ] | None = None ,
449+ hashes1 : LineSetHashResult ,
450+ hashes2 : LineSetHashResult ,
444451 ) -> Generator [Commonality ]:
445452 """Find similarities in the two given linesets.
446453
@@ -454,39 +461,24 @@ def _find_common(
454461 account common chunk of lines that have more than the minimal number of
455462 successive lines required.
456463 """
457- hash_to_index_1 : HashToIndex_T
458- hash_to_index_2 : HashToIndex_T
459- index_to_lines_1 : IndexToLines_T
460- index_to_lines_2 : IndexToLines_T
461- if hashes1 is not None :
462- hash_to_index_1 , index_to_lines_1 = hashes1
463- else :
464- hash_to_index_1 , index_to_lines_1 = hash_lineset (
465- lineset1 , self .namespace .min_similarity_lines
466- )
467- if hashes2 is not None :
468- hash_to_index_2 , index_to_lines_2 = hashes2
469- else :
470- hash_to_index_2 , index_to_lines_2 = hash_lineset (
471- lineset2 , self .namespace .min_similarity_lines
472- )
473-
474- common_hashes = hash_to_index_1 .keys () & hash_to_index_2 .keys ()
464+ common_hashes = hashes1 .hash_to_index .keys () & hashes2 .hash_to_index .keys ()
475465
476466 # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of
477467 # successive common lines, to the corresponding starting and ending number lines in both files
478468 all_couples : CplIndexToCplLines_T = {}
479469
480- for chunk_hash in sorted (common_hashes , key = lambda h : hash_to_index_1 [h ][0 ]):
470+ for chunk_hash in sorted (
471+ common_hashes , key = lambda h : hashes1 .hash_to_index [h ][0 ]
472+ ):
481473 for indices_in_linesets in itertools .product (
482- hash_to_index_1 [chunk_hash ], hash_to_index_2 [chunk_hash ]
474+ hashes1 . hash_to_index [chunk_hash ], hashes2 . hash_to_index [chunk_hash ]
483475 ):
484476 index_1 = indices_in_linesets [0 ]
485477 index_2 = indices_in_linesets [1 ]
486478 all_couples [LineSetStartCouple (index_1 , index_2 )] = (
487479 CplSuccessiveLinesLimits (
488- copy .copy (index_to_lines_1 [index_1 ]),
489- copy .copy (index_to_lines_2 [index_2 ]),
480+ copy .copy (hashes1 . index_to_lines [index_1 ]),
481+ copy .copy (hashes2 . index_to_lines [index_2 ]),
490482 effective_cmn_lines_nb = self .namespace .min_similarity_lines ,
491483 )
492484 )
@@ -522,17 +514,17 @@ def _iter_sims(self) -> Generator[Commonality]:
522514 min_lines = self .namespace .min_similarity_lines
523515 # Cache hash_lineset results: each lineset is compared against every
524516 # other, so without caching it gets hashed (N-1) times.
525- cache : dict [int , tuple [ HashToIndex_T , IndexToLines_T ] ] = {}
526- for idx , lineset in enumerate (self .linesets [:- 1 ]):
517+ cache : dict [int , LineSetHashResult ] = {}
518+ for idx , lineset1 in enumerate (self .linesets [:- 1 ]):
527519 for lineset2 in self .linesets [idx + 1 :]:
528- lid1 = id (lineset )
529- if lid1 not in cache :
530- cache [lid1 ] = hash_lineset (lineset , min_lines )
531- lid2 = id (lineset2 )
532- if lid2 not in cache :
533- cache [lid2 ] = hash_lineset (lineset2 , min_lines )
520+ key1 = id (lineset1 )
521+ if key1 not in cache :
522+ cache [key1 ] = hash_lineset (lineset1 , min_lines )
523+ key2 = id (lineset2 )
524+ if key2 not in cache :
525+ cache [key2 ] = hash_lineset (lineset2 , min_lines )
534526 yield from self ._find_common (
535- lineset , lineset2 , cache [lid1 ], cache [lid2 ]
527+ lineset1 , lineset2 , cache [key1 ], cache [key2 ]
536528 )
537529
538530 def get_map_data (self ) -> list [LineSet ]:
0 commit comments