@@ -74,34 +74,38 @@ def html_to_text(fragment):
7474 return text .strip ()
7575
7676
def parse_timestamp(date_text, raw_html, raw_search_start):
    """Resolve a Steam changelog date string to a Unix timestamp.

    First looks for a machine-readable epoch attribute (``data-timestamp`` etc.)
    near the occurrence of *date_text* in *raw_html*, searching from
    *raw_search_start*; falls back to parsing the text itself.

    Args:
        date_text: The human-readable date, e.g. ``"21 Mar @ 1:19pm"``.
        raw_html: The raw page HTML to scan for an exact timestamp attribute.
        raw_search_start: Offset in *raw_html* at which to start searching, so
            successive calls scan forward past already-consumed dates.

    Returns:
        A ``(timestamp, next_search_start)`` tuple.
    """
    raw_index = raw_html.find(date_text, raw_search_start)
    if raw_index != -1:
        # Prefer the exact epoch embedded in a nearby attribute when present.
        snippet = raw_html[max(0, raw_index - 600):raw_index + 600]
        attr_match = re.search(
            r'data-(?:timestamp|rtime(?:_updated)?|time_updated)="(\d+)"',
            snippet,
        )
        if attr_match:
            return int(attr_match.group(1)), raw_index + len(date_text)

    # Handle a date format without a year (e.g., "21 Mar @ 1:19pm").
    # A yearless date always refers to the past, so try the current year and
    # fall back to the previous one when the result would lie in the future
    # (e.g., a December entry fetched in January) or the date is invalid for
    # the current year (e.g., "29 Feb" outside a leap year).
    now = datetime.now(timezone.utc)
    parsed = None
    for year in (now.year, now.year - 1):
        try:
            candidate = datetime.strptime(
                f"{date_text} {year}", "%d %b @ %I:%M%p %Y"
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            continue
        parsed = candidate
        if candidate <= now:
            break

    if parsed is None:
        try:
            # Older page format that carries an explicit year.
            parsed = datetime.strptime(
                date_text, "%d %b, %Y @ %I:%M%p"
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            # Last resort: keep the caller going instead of crashing.
            parsed = now

    next_start = raw_index + len(date_text) if raw_index != -1 else raw_search_start
    return int(parsed.timestamp()), next_start
def extract_entries_from_page(raw_html, previous_ts):
    """Pull changelog entries newer than *previous_ts* from one HTML page.

    Each entry on the page is wrapped in a ``detailBox ... changeLogCtn`` div
    whose body paragraph carries the entry's Unix timestamp as its ``id``
    attribute. Entries appear newest-first, so scanning stops at the first
    entry that is not newer than *previous_ts*.

    Args:
        raw_html: The raw HTML of one changelog page.
        previous_ts: Unix timestamp of the most recent already-seen entry.

    Returns:
        A list of markdown-formatted entry strings, newest first.
    """
    # Compile once, outside the per-section loop.
    ts_re = re.compile(r'<p id="(?P<timestamp>\d+)">', re.IGNORECASE)
    body_re = re.compile(r'<p id="\d+">(?P<body>.*?)</p>', re.IGNORECASE | re.DOTALL)

    entries = []
    sections = re.split(
        r'<div class="detailBox workshopAnnouncement noFooter changeLogCtn">',
        raw_html,
        flags=re.IGNORECASE,
    )

    # sections[0] is everything before the first entry; skip it.
    for section in sections[1:]:
        timestamp_match = ts_re.search(section)
        if not timestamp_match:
            continue

        entry_ts = int(timestamp_match.group("timestamp"))
        if entry_ts <= previous_ts:
            # Entries are newest-first: everything from here on is old.
            break

        # Anchor the body match at the same <p> the timestamp came from,
        # rather than re-searching the section (which could hit a different
        # paragraph).
        body_match = body_re.match(section, timestamp_match.start())
        body_html = body_match.group("body") if body_match else ""
        body = html_to_text(body_html)

        if not body:
            body = "No changelog details were provided for this update."

        entry_date = datetime.fromtimestamp(entry_ts, tz=timezone.utc).strftime(
            "%Y-%m-%d %H:%M UTC"
        )
        entries.append(f"### {entry_date}\n\n{body}")

    return entries
105109
106110
107111def main ():
@@ -121,44 +125,18 @@ def main():
121125 "Accept-Language" : "en-US,en;q=0.9" ,
122126 }
123127
124- header_re = re .compile (
125- r"Update:\s+(?P<date>\d{1,2}\s+[A-Za-z]{3}\s+@\s+\d{1,2}:\d{2}[ap]m)\s+by\s+(?P<author>.*?)(?:\n|$)"
126- )
127-
128128 entries = []
129129 page_number = 1
130130
131131 try :
132132 while page_number <= 50 :
133133 raw_html = fetch_page (base_url , headers , page_number )
134- page_text = html_to_text (raw_html )
135- matches = list (header_re .finditer (page_text ))
134+ page_entries = extract_entries_from_page (raw_html , previous_ts )
136135
137- if not matches :
136+ if not page_entries :
138137 break
139138
140- raw_search_start = 0
141- for index , match in enumerate (matches ):
142- next_start = matches [index + 1 ].start () if index + 1 < len (matches ) else len (page_text )
143- body = page_text [match .end ():next_start ].strip ()
144-
145- footer_split = re .split (
146- r"\n(?:Showing\s+\d+-\d+\s+of\s+\d+\s+entries|Additional Links)\b" ,
147- body ,
148- maxsplit = 1 ,
149- )
150- body = footer_split [0 ].strip ()
151-
152- entry_ts , raw_search_start = parse_timestamp (match .group ("date" ), raw_html , raw_search_start )
153- if entry_ts <= previous_ts :
154- raise StopIteration
155-
156- if not body :
157- body = "No changelog details were provided for this update."
158-
159- entry_date = datetime .fromtimestamp (entry_ts , tz = timezone .utc ).strftime ("%Y-%m-%d %H:%M UTC" )
160- entries .append (f"### { entry_date } \n \n { body } " )
161-
139+ entries .extend (page_entries )
162140 page_number += 1
163141 except StopIteration :
164142 pass
0 commit comments