3333import copy
3434import functools
3535import itertools
36- import operator
3736import re
3837import sys
3938import warnings
@@ -71,9 +70,9 @@ class LineSpecifs(NamedTuple):
7170 text : str
7271
7372
74- # Links LinesChunk object to the starting indices (in lineset's stripped lines)
75- # of the different chunk of lines that are used to compute the hash
76- HashToIndex_T = dict ["LinesChunk" , list [Index ]]
73+ # Maps the hash of successive stripped lines to the starting indices
74+ # (in lineset's stripped lines) of the chunks that produced that hash.
75+ HashToIndex_T = dict [int , list [Index ]]
7776
7877# Links index in the lineset's stripped lines to the real lines in the file
7978IndexToLines_T = dict [Index , "SuccessiveLinesLimits" ]
@@ -105,45 +104,6 @@ def __init__(
105104CplIndexToCplLines_T = dict ["LineSetStartCouple" , CplSuccessiveLinesLimits ]
106105
107106
108- class LinesChunk :
109- """The LinesChunk object computes and stores the hash of some consecutive stripped
110- lines of a lineset.
111- """
112-
113- __slots__ = ("_fileid" , "_hash" , "_index" )
114-
115- def __init__ (self , fileid : str , num_line : int , * lines : Iterable [str ]) -> None :
116- self ._fileid : str = fileid
117- """The name of the file from which the LinesChunk object is generated."""
118-
119- self ._index : Index = Index (num_line )
120- """The index in the stripped lines that is the starting of consecutive
121- lines.
122- """
123-
124- self ._hash : int = sum (hash (lin ) for lin in lines )
125- """The hash of some consecutive lines."""
126-
127- def __eq__ (self , o : object ) -> bool :
128- if not isinstance (o , LinesChunk ):
129- return NotImplemented
130- return self ._hash == o ._hash
131-
132- def __hash__ (self ) -> int :
133- return self ._hash
134-
135- def __repr__ (self ) -> str :
136- return (
137- f"<LinesChunk object for file { self ._fileid } ({ self ._index } , { self ._hash } )>"
138- )
139-
140- def __str__ (self ) -> str :
141- return (
142- f"LinesChunk object for file { self ._fileid } , starting at line { self ._index } \n "
143- f"Hash is { self ._hash } "
144- )
145-
146-
147107class SuccessiveLinesLimits :
148108 """A class to handle the numbering of begin and end of successive lines.
149109
@@ -219,30 +179,41 @@ def hash_lineset(
219179 :return: a dict linking hashes to corresponding start index and a dict that links this
220180 index to the start and end lines in the file
221181 """
222- hash2index = defaultdict (list )
223- index2lines = {}
224- # Comments, docstring and other specific patterns maybe excluded -> call to stripped_lines
225- # to get only what is desired
226- lines = tuple (x .text for x in lineset .stripped_lines )
227- # Need different iterators on same lines but each one is shifted 1 from the precedent
228- shifted_lines = [iter (lines [i :]) for i in range (min_common_lines )]
229-
230- for i , * succ_lines in enumerate (zip (* shifted_lines )):
231- start_linenumber = LineNumber (lineset .stripped_lines [i ].line_number )
232- try :
233- end_linenumber = lineset .stripped_lines [i + min_common_lines ].line_number
234- except IndexError :
235- end_linenumber = LineNumber (lineset .stripped_lines [- 1 ].line_number + 1 )
182+ hash_to_index : HashToIndex_T = defaultdict (list )
183+ index_to_lines : IndexToLines_T = {}
184+ stripped = lineset .stripped_lines
185+ num_lines = len (stripped )
186+ if num_lines < min_common_lines :
187+ return hash_to_index , index_to_lines
188+
189+ # Pre-compute per-line hashes for the rolling window
190+ line_hashes = [hash (spec .text ) for spec in stripped ]
191+
192+ # Seed the rolling hash with the first window
193+ window_hash = sum (line_hashes [:min_common_lines ])
194+
195+ last_index = num_lines - 1
196+ for i in range (num_lines - min_common_lines + 1 ):
197+ start_linenumber = LineNumber (stripped [i ].line_number )
198+ window_end = i + min_common_lines
199+ end_linenumber = (
200+ stripped [window_end ].line_number
201+ if window_end <= last_index
202+ else LineNumber (stripped [last_index ].line_number + 1 )
203+ )
236204
237205 index = Index (i )
238- index2lines [index ] = SuccessiveLinesLimits (
206+ index_to_lines [index ] = SuccessiveLinesLimits (
239207 start = start_linenumber , end = end_linenumber
240208 )
241209
242- l_c = LinesChunk (lineset .name , index , * succ_lines )
243- hash2index [l_c ].append (index )
210+ hash_to_index [window_hash ].append (index )
211+
212+ # Slide the window: subtract the leaving line, add the entering line
213+ if window_end <= last_index :
214+ window_hash = window_hash - line_hashes [i ] + line_hashes [window_end ]
244215
245- return hash2index , index2lines
216+ return hash_to_index , index_to_lines
246217
247218
248219def remove_successive (all_couples : CplIndexToCplLines_T ) -> None :
@@ -465,7 +436,11 @@ def _get_similarity_report(
465436
466437 # pylint: disable = too-many-locals
467438 def _find_common (
468- self , lineset1 : LineSet , lineset2 : LineSet
439+ self ,
440+ lineset1 : LineSet ,
441+ lineset2 : LineSet ,
442+ hashes1 : tuple [HashToIndex_T , IndexToLines_T ] | None = None ,
443+ hashes2 : tuple [HashToIndex_T , IndexToLines_T ] | None = None ,
469444 ) -> Generator [Commonality ]:
470445 """Find similarities in the two given linesets.
471446
@@ -483,27 +458,28 @@ def _find_common(
483458 hash_to_index_2 : HashToIndex_T
484459 index_to_lines_1 : IndexToLines_T
485460 index_to_lines_2 : IndexToLines_T
486- hash_to_index_1 , index_to_lines_1 = hash_lineset (
487- lineset1 , self .namespace .min_similarity_lines
488- )
489- hash_to_index_2 , index_to_lines_2 = hash_lineset (
490- lineset2 , self .namespace .min_similarity_lines
491- )
492-
493- hash_1 : frozenset [LinesChunk ] = frozenset (hash_to_index_1 .keys ())
494- hash_2 : frozenset [LinesChunk ] = frozenset (hash_to_index_2 .keys ())
461+ if hashes1 is not None :
462+ hash_to_index_1 , index_to_lines_1 = hashes1
463+ else :
464+ hash_to_index_1 , index_to_lines_1 = hash_lineset (
465+ lineset1 , self .namespace .min_similarity_lines
466+ )
467+ if hashes2 is not None :
468+ hash_to_index_2 , index_to_lines_2 = hashes2
469+ else :
470+ hash_to_index_2 , index_to_lines_2 = hash_lineset (
471+ lineset2 , self .namespace .min_similarity_lines
472+ )
495473
496- common_hashes : Iterable [LinesChunk ] = sorted (
497- hash_1 & hash_2 , key = lambda m : hash_to_index_1 [m ][0 ]
498- )
474+ common_hashes = hash_to_index_1 .keys () & hash_to_index_2 .keys ()
499475
500476 # all_couples is a dict that links the couple of indices in both linesets that mark the beginning of
501477 # successive common lines, to the corresponding starting and ending number lines in both files
502478 all_couples : CplIndexToCplLines_T = {}
503479
504- for c_hash in sorted (common_hashes , key = operator . attrgetter ( "_index" ) ):
480+ for chunk_hash in sorted (common_hashes , key = lambda h : hash_to_index_1 [ h ][ 0 ] ):
505481 for indices_in_linesets in itertools .product (
506- hash_to_index_1 [c_hash ], hash_to_index_2 [c_hash ]
482+ hash_to_index_1 [chunk_hash ], hash_to_index_2 [chunk_hash ]
507483 ):
508484 index_1 = indices_in_linesets [0 ]
509485 index_2 = indices_in_linesets [1 ]
@@ -543,9 +519,21 @@ def _iter_sims(self) -> Generator[Commonality]:
543519 """Iterate on similarities among all files, by making a Cartesian
544520 product.
545521 """
522+ min_lines = self .namespace .min_similarity_lines
523+ # Cache hash_lineset results: each lineset is compared against every
524+ # other, so without caching it gets hashed (N-1) times.
525+ cache : dict [int , tuple [HashToIndex_T , IndexToLines_T ]] = {}
546526 for idx , lineset in enumerate (self .linesets [:- 1 ]):
547527 for lineset2 in self .linesets [idx + 1 :]:
548- yield from self ._find_common (lineset , lineset2 )
528+ lid1 = id (lineset )
529+ if lid1 not in cache :
530+ cache [lid1 ] = hash_lineset (lineset , min_lines )
531+ lid2 = id (lineset2 )
532+ if lid2 not in cache :
533+ cache [lid2 ] = hash_lineset (lineset2 , min_lines )
534+ yield from self ._find_common (
535+ lineset , lineset2 , cache [lid1 ], cache [lid2 ]
536+ )
549537
550538 def get_map_data (self ) -> list [LineSet ]:
551539 """Returns the data we can use for a map/reduce process.