Skip to content

Commit 2015286

Browse files
Refactor bench file generation to reduce duplication (#371)
- Replace the individual `generate_offsetN` functions with a `patterns` dictionary and a loop in `main`.
- Move the `write_pattern` helper to the top level and have it print the "Generating ..." progress message itself.
- Refactor `generate_file` to use `write_pattern`.
- Maintain deterministic output for benchmark files.

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent ff6f131 commit 2015286

1 file changed

Lines changed: 41 additions & 215 deletions

File tree

scripts/gen_bench_files.py

Lines changed: 41 additions & 215 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,17 @@ def get_random_data(size):
99
return base_pattern
1010

1111
def generate_file(filename, target_size):
12-
print(f"Generating {filename} ({target_size} bytes)...")
1312
pattern = get_random_data(100)
14-
13+
write_pattern(filename, target_size, pattern)
14+
15+
def write_pattern(filename, target_size, pattern):
16+
print(f"Generating {filename} ({target_size} bytes)...")
1517
with open(filename, 'wb') as f:
1618
bytes_written = 0
1719
chunk_size = 1024 * 1024
1820
large_chunk = pattern * (chunk_size // len(pattern) + 1)
1921
large_chunk = large_chunk[:chunk_size]
20-
22+
2123
while bytes_written < target_size:
2224
remaining = target_size - bytes_written
2325
write_amt = min(remaining, len(large_chunk))
@@ -40,37 +42,42 @@ def main():
4042
for name, size in sizes.items():
4143
generate_file(f"bench_data/data_{name}.bin", size)
4244

43-
generate_offset8("bench_data/data_offset8.bin", 1024 * 1024)
44-
generate_offset7("bench_data/data_offset7.bin", 1024 * 1024)
45-
generate_offset3("bench_data/data_offset3.bin", 1024 * 1024)
46-
generate_offset5("bench_data/data_offset5.bin", 1024 * 1024)
47-
generate_offset1("bench_data/data_offset1.bin", 1024 * 1024)
48-
generate_offset2("bench_data/data_offset2.bin", 1024 * 1024)
49-
generate_offset4("bench_data/data_offset4.bin", 1024 * 1024)
50-
generate_offset9("bench_data/data_offset9.bin", 1024 * 1024)
51-
generate_offset10("bench_data/data_offset10.bin", 1024 * 1024)
52-
generate_offset11("bench_data/data_offset11.bin", 1024 * 1024)
53-
generate_offset12("bench_data/data_offset12.bin", 1024 * 1024)
54-
generate_offset13("bench_data/data_offset13.bin", 1024 * 1024)
55-
generate_offset14("bench_data/data_offset14.bin", 1024 * 1024)
56-
generate_offset15("bench_data/data_offset15.bin", 1024 * 1024)
57-
generate_offset16("bench_data/data_offset16.bin", 1024 * 1024)
58-
generate_offset17("bench_data/data_offset17.bin", 1024 * 1024)
59-
generate_offset18("bench_data/data_offset18.bin", 1024 * 1024)
60-
generate_offset19("bench_data/data_offset19.bin", 1024 * 1024)
61-
generate_offset20("bench_data/data_offset20.bin", 1024 * 1024)
62-
generate_offset21("bench_data/data_offset21.bin", 1024 * 1024)
63-
generate_offset22("bench_data/data_offset22.bin", 1024 * 1024)
64-
generate_offset23("bench_data/data_offset23.bin", 1024 * 1024)
65-
generate_offset24("bench_data/data_offset24.bin", 1024 * 1024)
66-
generate_offset25("bench_data/data_offset25.bin", 1024 * 1024)
67-
generate_offset26("bench_data/data_offset26.bin", 1024 * 1024)
68-
generate_offset27("bench_data/data_offset27.bin", 1024 * 1024)
69-
generate_offset28("bench_data/data_offset28.bin", 1024 * 1024)
70-
generate_offset29("bench_data/data_offset29.bin", 1024 * 1024)
71-
generate_offset30("bench_data/data_offset30.bin", 1024 * 1024)
72-
generate_offset31("bench_data/data_offset31.bin", 1024 * 1024)
73-
generate_offset32("bench_data/data_offset32.bin", 1024 * 1024)
45+
patterns = {
46+
1: b"1",
47+
2: b"12",
48+
3: b"123",
49+
4: b"1234",
50+
5: b"12345",
51+
7: b"1234567",
52+
8: b"12345678",
53+
9: b"123456789",
54+
10: b"1234567890",
55+
11: b"12345678901",
56+
12: b"123456789012",
57+
13: b"1234567890123",
58+
14: b"12345678901234",
59+
15: b"123456789012345",
60+
16: b"1234567890123456",
61+
17: b"12345678901234567",
62+
18: b"123456789012345678",
63+
19: b"1234567890123456789",
64+
20: b"ABCDEFGHIJKLMNOPQRST",
65+
21: b"ABCDEFGHIJKLMNOPQRSTU",
66+
22: b"ABCDEFGHIJKLMNOPQRSTUV",
67+
23: b"ABCDEFGHIJKLMNOPQRSTUVW",
68+
24: b"ABCDEFGHIJKLMNOPQRSTUVWX",
69+
25: b"ABCDEFGHIJKLMNOPQRSTUVWXY",
70+
26: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
71+
27: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0",
72+
28: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01",
73+
29: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012",
74+
30: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123",
75+
31: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01234",
76+
32: b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012345"
77+
}
78+
79+
for offset, pattern in patterns.items():
80+
write_pattern(f"bench_data/data_offset{offset}.bin", 1024 * 1024, pattern)
7481

7582
# Small match variants
7683
# For offset N, we want matches of length N (or close to N).
@@ -91,186 +98,5 @@ def generate_offset_small(filename, target_size, pattern):
9198
f.write(bytes([random.randint(0, 255)]))
9299
bytes_written += len(pattern) * 2 + 1
93100

94-
def generate_offset1(filename, target_size):
95-
print(f"Generating {filename} ({target_size} bytes)...")
96-
pattern = b"1"
97-
write_pattern(filename, target_size, pattern)
98-
99-
def generate_offset12(filename, target_size):
100-
print(f"Generating {filename} ({target_size} bytes)...")
101-
pattern = b"123456789012"
102-
write_pattern(filename, target_size, pattern)
103-
104-
def generate_offset2(filename, target_size):
105-
print(f"Generating {filename} ({target_size} bytes)...")
106-
pattern = b"12"
107-
write_pattern(filename, target_size, pattern)
108-
109-
def generate_offset4(filename, target_size):
110-
print(f"Generating {filename} ({target_size} bytes)...")
111-
pattern = b"1234"
112-
write_pattern(filename, target_size, pattern)
113-
114-
def generate_offset3(filename, target_size):
115-
print(f"Generating {filename} ({target_size} bytes)...")
116-
pattern = b"123"
117-
write_pattern(filename, target_size, pattern)
118-
119-
def generate_offset8(filename, target_size):
120-
print(f"Generating {filename} ({target_size} bytes)...")
121-
pattern = b"12345678"
122-
write_pattern(filename, target_size, pattern)
123-
124-
def generate_offset7(filename, target_size):
125-
print(f"Generating {filename} ({target_size} bytes)...")
126-
pattern = b"1234567"
127-
write_pattern(filename, target_size, pattern)
128-
129-
def generate_offset5(filename, target_size):
130-
print(f"Generating {filename} ({target_size} bytes)...")
131-
pattern = b"12345"
132-
write_pattern(filename, target_size, pattern)
133-
134-
def generate_offset9(filename, target_size):
135-
print(f"Generating {filename} ({target_size} bytes)...")
136-
pattern = b"123456789"
137-
write_pattern(filename, target_size, pattern)
138-
139-
def generate_offset10(filename, target_size):
140-
print(f"Generating {filename} ({target_size} bytes)...")
141-
pattern = b"1234567890"
142-
write_pattern(filename, target_size, pattern)
143-
144-
def generate_offset11(filename, target_size):
145-
print(f"Generating {filename} ({target_size} bytes)...")
146-
pattern = b"12345678901"
147-
write_pattern(filename, target_size, pattern)
148-
149-
def generate_offset15(filename, target_size):
150-
print(f"Generating {filename} ({target_size} bytes)...")
151-
pattern = b"123456789012345"
152-
write_pattern(filename, target_size, pattern)
153-
154-
def generate_offset13(filename, target_size):
155-
print(f"Generating {filename} ({target_size} bytes)...")
156-
pattern = b"1234567890123"
157-
write_pattern(filename, target_size, pattern)
158-
159-
def generate_offset14(filename, target_size):
160-
print(f"Generating {filename} ({target_size} bytes)...")
161-
pattern = b"12345678901234"
162-
write_pattern(filename, target_size, pattern)
163-
164-
def generate_offset16(filename, target_size):
165-
print(f"Generating {filename} ({target_size} bytes)...")
166-
pattern = b"1234567890123456"
167-
write_pattern(filename, target_size, pattern)
168-
169-
def generate_offset17(filename, target_size):
170-
print(f"Generating {filename} ({target_size} bytes)...")
171-
pattern = b"12345678901234567"
172-
write_pattern(filename, target_size, pattern)
173-
174-
def generate_offset18(filename, target_size):
175-
print(f"Generating {filename} ({target_size} bytes)...")
176-
pattern = b"123456789012345678"
177-
write_pattern(filename, target_size, pattern)
178-
179-
def generate_offset19(filename, target_size):
180-
print(f"Generating {filename} ({target_size} bytes)...")
181-
pattern = b"1234567890123456789"
182-
write_pattern(filename, target_size, pattern)
183-
184-
def generate_offset21(filename, target_size):
185-
print(f"Generating {filename} ({target_size} bytes)...")
186-
# 21 unique bytes to avoid inner matches
187-
pattern = b"ABCDEFGHIJKLMNOPQRSTU"
188-
write_pattern(filename, target_size, pattern)
189-
190-
def generate_offset20(filename, target_size):
191-
print(f"Generating {filename} ({target_size} bytes)...")
192-
# 20 unique bytes to avoid inner matches
193-
pattern = b"ABCDEFGHIJKLMNOPQRST"
194-
write_pattern(filename, target_size, pattern)
195-
196-
def generate_offset22(filename, target_size):
197-
print(f"Generating {filename} ({target_size} bytes)...")
198-
# 22 unique bytes to avoid inner matches
199-
pattern = b"ABCDEFGHIJKLMNOPQRSTUV"
200-
write_pattern(filename, target_size, pattern)
201-
202-
def write_pattern(filename, target_size, pattern):
203-
with open(filename, 'wb') as f:
204-
bytes_written = 0
205-
chunk_size = 1024 * 1024
206-
large_chunk = pattern * (chunk_size // len(pattern) + 1)
207-
large_chunk = large_chunk[:chunk_size]
208-
209-
while bytes_written < target_size:
210-
remaining = target_size - bytes_written
211-
write_amt = min(remaining, len(large_chunk))
212-
f.write(large_chunk[:write_amt])
213-
bytes_written += write_amt
214-
215-
def generate_offset23(filename, target_size):
216-
print(f"Generating {filename} ({target_size} bytes)...")
217-
# 23 unique bytes to avoid inner matches
218-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVW"
219-
write_pattern(filename, target_size, pattern)
220-
221-
def generate_offset24(filename, target_size):
222-
print(f"Generating {filename} ({target_size} bytes)...")
223-
# 24 unique bytes to avoid inner matches
224-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWX"
225-
write_pattern(filename, target_size, pattern)
226-
227-
def generate_offset25(filename, target_size):
228-
print(f"Generating {filename} ({target_size} bytes)...")
229-
# 25 unique bytes to avoid inner matches
230-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXY"
231-
write_pattern(filename, target_size, pattern)
232-
233-
def generate_offset26(filename, target_size):
234-
print(f"Generating {filename} ({target_size} bytes)...")
235-
# 26 unique bytes to avoid inner matches
236-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
237-
write_pattern(filename, target_size, pattern)
238-
239-
def generate_offset27(filename, target_size):
240-
print(f"Generating {filename} ({target_size} bytes)...")
241-
# 27 unique bytes to avoid inner matches
242-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0"
243-
write_pattern(filename, target_size, pattern)
244-
245-
def generate_offset28(filename, target_size):
246-
print(f"Generating {filename} ({target_size} bytes)...")
247-
# 28 unique bytes to avoid inner matches
248-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01"
249-
write_pattern(filename, target_size, pattern)
250-
251-
def generate_offset29(filename, target_size):
252-
print(f"Generating {filename} ({target_size} bytes)...")
253-
# 29 unique bytes to avoid inner matches
254-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012"
255-
write_pattern(filename, target_size, pattern)
256-
257-
def generate_offset30(filename, target_size):
258-
print(f"Generating {filename} ({target_size} bytes)...")
259-
# 30 unique bytes to avoid inner matches
260-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123"
261-
write_pattern(filename, target_size, pattern)
262-
263-
def generate_offset31(filename, target_size):
264-
print(f"Generating {filename} ({target_size} bytes)...")
265-
# 31 unique bytes to avoid inner matches
266-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ01234"
267-
write_pattern(filename, target_size, pattern)
268-
269-
def generate_offset32(filename, target_size):
270-
print(f"Generating {filename} ({target_size} bytes)...")
271-
# 32 unique bytes to avoid inner matches
272-
pattern = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ012345"
273-
write_pattern(filename, target_size, pattern)
274-
275101
if __name__ == "__main__":
276102
main()

0 commit comments

Comments
 (0)