-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatchProcess.py
More file actions
96 lines (87 loc) · 2.58 KB
/
Copy pathbatchProcess.py
File metadata and controls
96 lines (87 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import csv
import re
from time import sleep
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from percy import percy_snapshot
# Path of the CSV file whose first column lists the URLs to snapshot.
CSV_FILE = "./urls.csv"
# Number of batches the URL list is split into; also the thread-pool size.
NUM_THREADS = 2
def load_urls(csv_path=None):
    """Read the snapshot target URLs from the first column of a CSV file.

    Args:
        csv_path: Path of the CSV file to read. When omitted, the
            module-level ``CSV_FILE`` is used.

    Returns:
        The stripped first-column values that start with ``http://`` or
        ``https://``, in file order. Empty rows and rows whose first cell
        is not such a URL (for example a header row) are skipped.
    """
    if csv_path is None:
        csv_path = CSV_FILE

    urls = []
    # "utf-8-sig" drops a leading byte-order mark if one is present.
    # Without it, a BOM stays attached to the first cell, the prefix check
    # below fails, and the first URL of the file is silently lost.
    with open(csv_path, newline="", encoding="utf-8-sig") as file:
        for row in csv.reader(file):
            if not row:
                continue
            candidate = row[0].strip()
            if candidate.startswith(("http://", "https://")):
                urls.append(candidate)
    return urls
def split_list(lst, n):
    """Partition ``lst`` into ``n`` contiguous slices of near-equal length.

    The first ``len(lst) % n`` slices receive one extra element, so slice
    lengths differ by at most one. When ``lst`` has fewer than ``n``
    elements the trailing slices are empty.

    Args:
        lst: Sequence to partition (anything that supports ``len`` and
            slicing).
        n: Number of slices to produce.

    Returns:
        A list of ``n`` slices of ``lst``, in the original order.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    base_size, remainder = divmod(len(lst), n)
    chunks = []
    start = 0
    for index in range(n):
        # The first `remainder` chunks absorb the leftover elements.
        stop = start + base_size + (1 if index < remainder else 0)
        chunks.append(lst[start:stop])
        start = stop
    return chunks
def get_snapshot_name(url):
    """Derive a snapshot name from a URL's network location and path.

    The name is the URL's netloc with a leading ``www.`` removed, followed
    by ``_`` and the path with every character outside ``[a-zA-Z0-9_-]``
    replaced by ``_``. The query string and fragment do not contribute to
    the name.

    Args:
        url: Absolute URL of the page being snapshotted.

    Returns:
        ``"<host>_<sanitized path>"``, or just ``"<host>"`` when the path
        is empty or ``/``.
    """
    parsed = urlparse(url)

    hostname = parsed.netloc
    # Strip only a *leading* "www."; a blanket str.replace would also eat
    # the sequence inside other labels (e.g. "awww.example.com" or
    # "shop.www.example.com") and could make distinct hosts collide.
    if hostname.startswith("www."):
        hostname = hostname[len("www."):]

    path = parsed.path.strip("/")
    sanitized_path = re.sub(r"[^a-zA-Z0-9_-]", "_", path)

    if sanitized_path:
        return f"{hostname}_{sanitized_path}"
    return hostname
def create_driver():
    """Start a headless Chrome WebDriver with a 1200x800 window.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_flags = (
        "--headless=new",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-gpu",
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options)
    browser.set_window_size(1200, 800)
    return browser
def process_urls(urls):
    """Take a Percy snapshot of every URL in ``urls`` using one browser.

    A single driver is created for the whole batch and is always quit at
    the end. A failure on one URL is printed and does not stop the
    remaining URLs from being processed.

    Args:
        urls: URLs to visit. If empty (or otherwise falsy), nothing is
            done and no browser is started.
    """
    if not urls:
        return

    browser = create_driver()
    try:
        for target in urls:
            try:
                _snapshot_page(browser, target)
            except Exception as error:
                # Best effort: report the failure and move on to the next URL.
                print(f"Failed for URL: {target}")
                print(str(error))
    finally:
        browser.quit()


def _snapshot_page(browser, target):
    """Load ``target`` in ``browser`` and capture one Percy snapshot of it."""
    print(f"Loading URL: {target}")
    browser.get(target)
    # Fixed pause after navigation (presumably to let the page settle
    # before the snapshot is captured).
    sleep(3)

    name = get_snapshot_name(target)
    print(f"Taking Percy snapshot: {name}")
    percy_snapshot(browser, name, widths=[768, 1200])
    print(f"Snapshot completed: {name}")
def main():
    """Load the URL list and snapshot it in parallel batches.

    The URLs are split into ``NUM_THREADS`` batches; each non-empty batch
    is handed to ``process_urls`` on a thread pool. Every future's result
    is collected, so an exception that escapes a worker is re-raised here.
    """
    all_urls = load_urls()
    if not all_urls:
        print("No URLs found in urls.csv")
        return

    print(f"Found {len(all_urls)} URLs")
    print(f"Using {NUM_THREADS} parallel threads")

    # Fewer URLs than threads yields empty batches; those are not submitted.
    work_batches = [
        chunk for chunk in split_list(all_urls, NUM_THREADS) if chunk
    ]

    with ThreadPoolExecutor(max_workers=NUM_THREADS) as pool:
        pending = [pool.submit(process_urls, chunk) for chunk in work_batches]
        for job in pending:
            job.result()

    print("All snapshots completed")
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()