@@ -9,15 +9,17 @@ def get_random_data(size):
99 return base_pattern
1010
1111def generate_file (filename , target_size ):
12- print (f"Generating { filename } ({ target_size } bytes)..." )
1312 pattern = get_random_data (100 )
14-
13+ write_pattern (filename , target_size , pattern )
14+
15+ def write_pattern (filename , target_size , pattern ):
16+ print (f"Generating { filename } ({ target_size } bytes)..." )
1517 with open (filename , 'wb' ) as f :
1618 bytes_written = 0
1719 chunk_size = 1024 * 1024
1820 large_chunk = pattern * (chunk_size // len (pattern ) + 1 )
1921 large_chunk = large_chunk [:chunk_size ]
20-
22+
2123 while bytes_written < target_size :
2224 remaining = target_size - bytes_written
2325 write_amt = min (remaining , len (large_chunk ))
@@ -40,37 +42,42 @@ def main():
4042 for name , size in sizes .items ():
4143 generate_file (f"bench_data/data_{ name } .bin" , size )
4244
43- generate_offset8 ("bench_data/data_offset8.bin" , 1024 * 1024 )
44- generate_offset7 ("bench_data/data_offset7.bin" , 1024 * 1024 )
45- generate_offset3 ("bench_data/data_offset3.bin" , 1024 * 1024 )
46- generate_offset5 ("bench_data/data_offset5.bin" , 1024 * 1024 )
47- generate_offset1 ("bench_data/data_offset1.bin" , 1024 * 1024 )
48- generate_offset2 ("bench_data/data_offset2.bin" , 1024 * 1024 )
49- generate_offset4 ("bench_data/data_offset4.bin" , 1024 * 1024 )
50- generate_offset9 ("bench_data/data_offset9.bin" , 1024 * 1024 )
51- generate_offset10 ("bench_data/data_offset10.bin" , 1024 * 1024 )
52- generate_offset11 ("bench_data/data_offset11.bin" , 1024 * 1024 )
53- generate_offset12 ("bench_data/data_offset12.bin" , 1024 * 1024 )
54- generate_offset13 ("bench_data/data_offset13.bin" , 1024 * 1024 )
55- generate_offset14 ("bench_data/data_offset14.bin" , 1024 * 1024 )
56- generate_offset15 ("bench_data/data_offset15.bin" , 1024 * 1024 )
57- generate_offset16 ("bench_data/data_offset16.bin" , 1024 * 1024 )
58- generate_offset17 ("bench_data/data_offset17.bin" , 1024 * 1024 )
59- generate_offset18 ("bench_data/data_offset18.bin" , 1024 * 1024 )
60- generate_offset19 ("bench_data/data_offset19.bin" , 1024 * 1024 )
61- generate_offset20 ("bench_data/data_offset20.bin" , 1024 * 1024 )
62- generate_offset21 ("bench_data/data_offset21.bin" , 1024 * 1024 )
63- generate_offset22 ("bench_data/data_offset22.bin" , 1024 * 1024 )
64- generate_offset23 ("bench_data/data_offset23.bin" , 1024 * 1024 )
65- generate_offset24 ("bench_data/data_offset24.bin" , 1024 * 1024 )
66- generate_offset25 ("bench_data/data_offset25.bin" , 1024 * 1024 )
67- generate_offset26 ("bench_data/data_offset26.bin" , 1024 * 1024 )
68- generate_offset27 ("bench_data/data_offset27.bin" , 1024 * 1024 )
69- generate_offset28 ("bench_data/data_offset28.bin" , 1024 * 1024 )
70- generate_offset29 ("bench_data/data_offset29.bin" , 1024 * 1024 )
71- generate_offset30 ("bench_data/data_offset30.bin" , 1024 * 1024 )
72- generate_offset31 ("bench_data/data_offset31.bin" , 1024 * 1024 )
73- generate_offset32 ("bench_data/data_offset32.bin" , 1024 * 1024 )
45+ patterns = {
46+ 1 : b"1" ,
47+ 2 : b"12" ,
48+ 3 : b"123" ,
49+ 4 : b"1234" ,
50+ 5 : b"12345" ,
51+ 7 : b"1234567" ,
52+ 8 : b"12345678" ,
53+ 9 : b"123456789" ,
54+ 10 : b"1234567890" ,
55+ 11 : b"12345678901" ,
56+ 12 : b"123456789012" ,
57+ 13 : b"1234567890123" ,
58+ 14 : b"12345678901234" ,
59+ 15 : b"123456789012345" ,
60+ 16 : b"1234567890123456" ,
61+ 17 : b"12345678901234567" ,
62+ 18 : b"123456789012345678" ,
63+ 19 : b"1234567890123456789" ,
64+ 20 : b"ABCDEFGHIJKLMNOPQRST" ,
65+ 21 : b"ABCDEFGHIJKLMNOPQRSTU" ,
66+ 22 : b"ABCDEFGHIJKLMNOPQRSTUV" ,
67+ 23 : b"ABCDEFGHIJKLMNOPQRSTUVW" ,
68+ 24 : b"ABCDEFGHIJKLMNOPQRSTUVWX" ,
69+ 25 : b"ABCDEFGHIJKLMNOPQRSTUVWXY" ,
70+ 26 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ" ,
71+ 27 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0" ,
72+ 28 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01" ,
73+ 29 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012" ,
74+ 30 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123" ,
75+ 31 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01234" ,
76+ 32 : b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012345"
77+ }
78+
79+ for offset , pattern in patterns .items ():
80+ write_pattern (f"bench_data/data_offset{ offset } .bin" , 1024 * 1024 , pattern )
7481
7582 # Small match variants
7683 # For offset N, we want matches of length N (or close to N).
@@ -91,186 +98,5 @@ def generate_offset_small(filename, target_size, pattern):
9198 f .write (bytes ([random .randint (0 , 255 )]))
9299 bytes_written += len (pattern ) * 2 + 1
93100
94- def generate_offset1 (filename , target_size ):
95- print (f"Generating { filename } ({ target_size } bytes)..." )
96- pattern = b"1"
97- write_pattern (filename , target_size , pattern )
98-
99- def generate_offset12 (filename , target_size ):
100- print (f"Generating { filename } ({ target_size } bytes)..." )
101- pattern = b"123456789012"
102- write_pattern (filename , target_size , pattern )
103-
104- def generate_offset2 (filename , target_size ):
105- print (f"Generating { filename } ({ target_size } bytes)..." )
106- pattern = b"12"
107- write_pattern (filename , target_size , pattern )
108-
109- def generate_offset4 (filename , target_size ):
110- print (f"Generating { filename } ({ target_size } bytes)..." )
111- pattern = b"1234"
112- write_pattern (filename , target_size , pattern )
113-
114- def generate_offset3 (filename , target_size ):
115- print (f"Generating { filename } ({ target_size } bytes)..." )
116- pattern = b"123"
117- write_pattern (filename , target_size , pattern )
118-
119- def generate_offset8 (filename , target_size ):
120- print (f"Generating { filename } ({ target_size } bytes)..." )
121- pattern = b"12345678"
122- write_pattern (filename , target_size , pattern )
123-
124- def generate_offset7 (filename , target_size ):
125- print (f"Generating { filename } ({ target_size } bytes)..." )
126- pattern = b"1234567"
127- write_pattern (filename , target_size , pattern )
128-
129- def generate_offset5 (filename , target_size ):
130- print (f"Generating { filename } ({ target_size } bytes)..." )
131- pattern = b"12345"
132- write_pattern (filename , target_size , pattern )
133-
134- def generate_offset9 (filename , target_size ):
135- print (f"Generating { filename } ({ target_size } bytes)..." )
136- pattern = b"123456789"
137- write_pattern (filename , target_size , pattern )
138-
139- def generate_offset10 (filename , target_size ):
140- print (f"Generating { filename } ({ target_size } bytes)..." )
141- pattern = b"1234567890"
142- write_pattern (filename , target_size , pattern )
143-
144- def generate_offset11 (filename , target_size ):
145- print (f"Generating { filename } ({ target_size } bytes)..." )
146- pattern = b"12345678901"
147- write_pattern (filename , target_size , pattern )
148-
149- def generate_offset15 (filename , target_size ):
150- print (f"Generating { filename } ({ target_size } bytes)..." )
151- pattern = b"123456789012345"
152- write_pattern (filename , target_size , pattern )
153-
154- def generate_offset13 (filename , target_size ):
155- print (f"Generating { filename } ({ target_size } bytes)..." )
156- pattern = b"1234567890123"
157- write_pattern (filename , target_size , pattern )
158-
159- def generate_offset14 (filename , target_size ):
160- print (f"Generating { filename } ({ target_size } bytes)..." )
161- pattern = b"12345678901234"
162- write_pattern (filename , target_size , pattern )
163-
164- def generate_offset16 (filename , target_size ):
165- print (f"Generating { filename } ({ target_size } bytes)..." )
166- pattern = b"1234567890123456"
167- write_pattern (filename , target_size , pattern )
168-
169- def generate_offset17 (filename , target_size ):
170- print (f"Generating { filename } ({ target_size } bytes)..." )
171- pattern = b"12345678901234567"
172- write_pattern (filename , target_size , pattern )
173-
174- def generate_offset18 (filename , target_size ):
175- print (f"Generating { filename } ({ target_size } bytes)..." )
176- pattern = b"123456789012345678"
177- write_pattern (filename , target_size , pattern )
178-
179- def generate_offset19 (filename , target_size ):
180- print (f"Generating { filename } ({ target_size } bytes)..." )
181- pattern = b"1234567890123456789"
182- write_pattern (filename , target_size , pattern )
183-
184- def generate_offset21 (filename , target_size ):
185- print (f"Generating { filename } ({ target_size } bytes)..." )
186- # 21 unique bytes to avoid inner matches
187- pattern = b"ABCDEFGHIJKLMNOPQRSTU"
188- write_pattern (filename , target_size , pattern )
189-
190- def generate_offset20 (filename , target_size ):
191- print (f"Generating { filename } ({ target_size } bytes)..." )
192- # 20 unique bytes to avoid inner matches
193- pattern = b"ABCDEFGHIJKLMNOPQRST"
194- write_pattern (filename , target_size , pattern )
195-
196- def generate_offset22 (filename , target_size ):
197- print (f"Generating { filename } ({ target_size } bytes)..." )
198- # 22 unique bytes to avoid inner matches
199- pattern = b"ABCDEFGHIJKLMNOPQRSTUV"
200- write_pattern (filename , target_size , pattern )
201-
202- def write_pattern (filename , target_size , pattern ):
203- with open (filename , 'wb' ) as f :
204- bytes_written = 0
205- chunk_size = 1024 * 1024
206- large_chunk = pattern * (chunk_size // len (pattern ) + 1 )
207- large_chunk = large_chunk [:chunk_size ]
208-
209- while bytes_written < target_size :
210- remaining = target_size - bytes_written
211- write_amt = min (remaining , len (large_chunk ))
212- f .write (large_chunk [:write_amt ])
213- bytes_written += write_amt
214-
215- def generate_offset23 (filename , target_size ):
216- print (f"Generating { filename } ({ target_size } bytes)..." )
217- # 23 unique bytes to avoid inner matches
218- pattern = b"ABCDEFGHIJKLMNOPQRSTUVW"
219- write_pattern (filename , target_size , pattern )
220-
221- def generate_offset24 (filename , target_size ):
222- print (f"Generating { filename } ({ target_size } bytes)..." )
223- # 24 unique bytes to avoid inner matches
224- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWX"
225- write_pattern (filename , target_size , pattern )
226-
227- def generate_offset25 (filename , target_size ):
228- print (f"Generating { filename } ({ target_size } bytes)..." )
229- # 25 unique bytes to avoid inner matches
230- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXY"
231- write_pattern (filename , target_size , pattern )
232-
233- def generate_offset26 (filename , target_size ):
234- print (f"Generating { filename } ({ target_size } bytes)..." )
235- # 26 unique bytes to avoid inner matches
236- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
237- write_pattern (filename , target_size , pattern )
238-
239- def generate_offset27 (filename , target_size ):
240- print (f"Generating { filename } ({ target_size } bytes)..." )
241- # 27 unique bytes to avoid inner matches
242- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0"
243- write_pattern (filename , target_size , pattern )
244-
245- def generate_offset28 (filename , target_size ):
246- print (f"Generating { filename } ({ target_size } bytes)..." )
247- # 28 unique bytes to avoid inner matches
248- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01"
249- write_pattern (filename , target_size , pattern )
250-
251- def generate_offset29 (filename , target_size ):
252- print (f"Generating { filename } ({ target_size } bytes)..." )
253- # 29 unique bytes to avoid inner matches
254- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012"
255- write_pattern (filename , target_size , pattern )
256-
257- def generate_offset30 (filename , target_size ):
258- print (f"Generating { filename } ({ target_size } bytes)..." )
259- # 30 unique bytes to avoid inner matches
260- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123"
261- write_pattern (filename , target_size , pattern )
262-
263- def generate_offset31 (filename , target_size ):
264- print (f"Generating { filename } ({ target_size } bytes)..." )
265- # 31 unique bytes to avoid inner matches
266- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01234"
267- write_pattern (filename , target_size , pattern )
268-
269- def generate_offset32 (filename , target_size ):
270- print (f"Generating { filename } ({ target_size } bytes)..." )
271- # 32 unique bytes to avoid inner matches
272- pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012345"
273- write_pattern (filename , target_size , pattern )
274-
275101if __name__ == "__main__" :
276102 main ()
0 commit comments