Skip to content

Commit abf4dd0

Browse files
authored
Fix Steam Workshop mod changelog not being scraped correctly (#3097)
1 parent 509c8f4 commit abf4dd0

1 file changed

Lines changed: 35 additions & 57 deletions

File tree

.github/scripts/steam_changelog_scraper.py

Lines changed: 35 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -74,34 +74,38 @@ def html_to_text(fragment):
7474
return text.strip()
7575

7676

77-
def parse_timestamp(date_text, raw_html, raw_search_start):
    """Resolve a scraped changelog date string to a Unix timestamp.

    Args:
        date_text: Human-readable date from the page, either the year-less
            form "21 Mar @ 1:19pm" or the explicit form "21 Mar, 2023 @ 1:19pm".
        raw_html: Raw page HTML; searched for a machine-readable
            ``data-timestamp``-style attribute near the date text, which is
            preferred over parsing the display string.
        raw_search_start: Offset into ``raw_html`` where the search for
            ``date_text`` begins, so repeated identical dates resolve in
            page order.

    Returns:
        ``(timestamp, next_search_start)`` — ``next_search_start`` advances
        past the matched date text, or is ``raw_search_start`` unchanged when
        ``date_text`` was not found in ``raw_html``.
    """
    raw_index = raw_html.find(date_text, raw_search_start)
    if raw_index != -1:
        # Prefer the exact epoch value Steam embeds in a nearby attribute.
        snippet = raw_html[max(0, raw_index - 600):raw_index + 600]
        attr_match = re.search(
            r'data-(?:timestamp|rtime(?:_updated)?|time_updated)="(\d+)"',
            snippet,
        )
        if attr_match:
            return int(attr_match.group(1)), raw_index + len(date_text)

    now = datetime.now(timezone.utc)
    parsed = None
    # Steam omits the year for recent dates (e.g. "21 Mar @ 1:19pm").
    # BUG FIX: the current year is not always right — a date from late last
    # year parses *successfully* with the current year (no ValueError), just
    # into the future. So try the current year first, and if the result lands
    # in the future the entry must be from the previous year. The one-day
    # slack absorbs offsets between Steam's displayed local time and UTC.
    for year in (now.year, now.year - 1):
        try:
            candidate = datetime.strptime(
                f"{date_text} {year}", "%d %b @ %I:%M%p %Y"
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            # e.g. "29 Feb" in a non-leap year, or a different date format.
            continue
        parsed = candidate
        if (parsed - now).total_seconds() <= 86400:
            break
    if parsed is None:
        try:
            # Older format that includes the year explicitly.
            parsed = datetime.strptime(
                date_text, "%d %b, %Y @ %I:%M%p"
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            # Unparseable date: fall back to "now" rather than crashing.
            parsed = now
    return int(parsed.timestamp()), raw_index + len(date_text) if raw_index != -1 else raw_search_start
77+
def extract_entries_from_page(raw_html, previous_ts):
    """Parse one Steam Workshop changelog page into formatted entry strings.

    Splits the page on the per-entry announcement container ``<div>``, pulls
    the Unix timestamp out of each entry's ``<p id="...">`` marker, and
    renders every entry newer than *previous_ts* as a ``### <date>`` markdown
    section. Entries are assumed newest-first, so scanning stops at the first
    entry that is not newer than *previous_ts*; the result may be empty.
    """
    container = r'<div class="detailBox workshopAnnouncement noFooter changeLogCtn">'
    chunks = re.split(container, raw_html, flags=re.IGNORECASE)

    collected = []
    # chunks[0] is everything before the first entry container — skip it.
    for chunk in chunks[1:]:
        id_match = re.search(r'<p id="(?P<timestamp>\d+)">', chunk, flags=re.IGNORECASE)
        if id_match is None:
            continue

        stamp = int(id_match.group("timestamp"))
        if stamp <= previous_ts:
            # Already-seen entry; everything after it is older still.
            break

        content_match = re.search(
            r'<p id="\d+">(?P<body>.*?)</p>',
            chunk,
            flags=re.IGNORECASE | re.DOTALL,
        )
        raw_body = content_match.group("body") if content_match else ""
        text = html_to_text(raw_body)
        if not text:
            text = "No changelog details were provided for this update."

        heading = datetime.fromtimestamp(stamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        collected.append(f"### {heading}\n\n{text}")

    return collected
105109

106110

107111
def main():
@@ -121,44 +125,18 @@ def main():
121125
"Accept-Language": "en-US,en;q=0.9",
122126
}
123127

124-
header_re = re.compile(
125-
r"Update:\s+(?P<date>\d{1,2}\s+[A-Za-z]{3}\s+@\s+\d{1,2}:\d{2}[ap]m)\s+by\s+(?P<author>.*?)(?:\n|$)"
126-
)
127-
128128
entries = []
129129
page_number = 1
130130

131131
try:
132132
while page_number <= 50:
133133
raw_html = fetch_page(base_url, headers, page_number)
134-
page_text = html_to_text(raw_html)
135-
matches = list(header_re.finditer(page_text))
134+
page_entries = extract_entries_from_page(raw_html, previous_ts)
136135

137-
if not matches:
136+
if not page_entries:
138137
break
139138

140-
raw_search_start = 0
141-
for index, match in enumerate(matches):
142-
next_start = matches[index + 1].start() if index + 1 < len(matches) else len(page_text)
143-
body = page_text[match.end():next_start].strip()
144-
145-
footer_split = re.split(
146-
r"\n(?:Showing\s+\d+-\d+\s+of\s+\d+\s+entries|Additional Links)\b",
147-
body,
148-
maxsplit=1,
149-
)
150-
body = footer_split[0].strip()
151-
152-
entry_ts, raw_search_start = parse_timestamp(match.group("date"), raw_html, raw_search_start)
153-
if entry_ts <= previous_ts:
154-
raise StopIteration
155-
156-
if not body:
157-
body = "No changelog details were provided for this update."
158-
159-
entry_date = datetime.fromtimestamp(entry_ts, tz=timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
160-
entries.append(f"### {entry_date}\n\n{body}")
161-
139+
entries.extend(page_entries)
162140
page_number += 1
163141
except StopIteration:
164142
pass

0 commit comments

Comments
 (0)