Skip to content

Commit 3091005

Browse files
committed
fix: update skydemonorder crawler for Livewire migration
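The site migrated from Alpine.js inline data on <section> elements to Laravel Livewire lazy-loaded components. The crawler now fetches the chapter list by sending a Livewire lazy-load POST request (using the page's CSRF token and the chapter-list component's snapshot) and parsing the response HTML, which still embeds the chapter JSON (`freeChapters`) and the project slug in an Alpine.js `x-data` attribute. Also fixes the cover image CSS selector, whose string was missing the `f` prefix, so `{self.novel_title}` was never interpolated.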
1 parent 74b9a96 commit 3091005

1 file changed

Lines changed: 50 additions & 23 deletions

File tree

sources/en/s/skydemonorder.py

Lines changed: 50 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import logging
3+
import re
34

45
from lncrawl.core import Chapter, LegacyCrawler, Volume
56

@@ -18,31 +19,57 @@ def read_novel_info(self) -> None:
1819
self.novel_title = possible_title
1920
logger.info("Novel title: %s", self.novel_title)
2021

21-
possible_image = soup.select_one("img[alt='{self.novel_title}']")
22+
possible_image = soup.select_one(f"img[alt='{self.novel_title}']")
2223
if possible_image:
2324
self.novel_cover = possible_image["src"]
2425
logger.info("Novel cover: %s", self.novel_cover)
2526

26-
section = soup.find_all("section", attrs={"x-data": True})
27-
28-
if len(section) == 1:
29-
chapter_section = section[0]
30-
else:
31-
chapter_section = section[1]
32-
33-
section_data = "".join(chapter_section.get("x-data").split("})(")[1].split())[
34-
:-2
35-
] # remove all whitespace & remove last 2 chars
36-
37-
chapters_obj = json.loads(section_data)
38-
39-
if isinstance(chapters_obj, list) is False:
40-
chapters = []
41-
for _, value in chapters_obj.items():
42-
chapters.append(value)
43-
44-
else:
45-
chapters = chapters_obj
27+
# Extract CSRF token for Livewire request
28+
csrf_meta = soup.select_one('meta[name="csrf-token"]')
29+
assert csrf_meta, "No CSRF token found"
30+
csrf_token = csrf_meta["content"]
31+
32+
# Extract Livewire update URL from script tag
33+
lw_script = soup.select_one('script[src*="livewire"]')
34+
assert lw_script, "No Livewire script tag found"
35+
lw_match = re.search(r"(livewire-[a-f0-9]+)", lw_script["src"])
36+
assert lw_match, "Could not extract Livewire path"
37+
livewire_url = self.absolute_url(f"/{lw_match.group(1)}/update")
38+
39+
# Extract snapshot from the lazy-loaded chapter-list component
40+
lw_div = soup.find("div", attrs={"wire:name": "project.chapter-list"})
41+
assert lw_div, "No Livewire chapter-list component found"
42+
snapshot = lw_div["wire:snapshot"]
43+
44+
# Fetch chapter list via Livewire lazy-load
45+
lw_response = self.post_json(
46+
livewire_url,
47+
data={"components": [{"snapshot": snapshot, "updates": {}, "calls": []}]},
48+
headers={"X-CSRF-TOKEN": csrf_token, "X-Livewire": ""},
49+
)
50+
51+
chapter_html = lw_response["components"][0]["effects"]["html"]
52+
chapter_soup = self.make_soup(chapter_html)
53+
54+
# Parse chapter data from Alpine.js x-data in the response
55+
xdata_div = chapter_soup.select_one("div[x-data]")
56+
assert xdata_div, "No x-data div in chapter list response"
57+
x_data = xdata_div["x-data"]
58+
59+
free_match = re.search(r"freeChapters:\s*JSON\.parse\('(.+?)'\)", x_data)
60+
assert free_match, "Could not extract freeChapters from x-data"
61+
# Decode JS string escapes before parsing JSON
62+
raw_json = re.sub(
63+
r"\\u([0-9a-fA-F]{4})",
64+
lambda m: chr(int(m.group(1), 16)),
65+
free_match.group(1),
66+
)
67+
raw_json = raw_json.replace("\\'", "'")
68+
chapters = json.loads(raw_json)
69+
70+
slug_match = re.search(r"projectSlug:\s*'([^']+)'", x_data)
71+
assert slug_match, "Could not extract projectSlug from x-data"
72+
project_slug = slug_match.group(1)
4673

4774
chapters.reverse()
4875

@@ -57,8 +84,8 @@ def read_novel_info(self) -> None:
5784
Chapter(
5885
id=chap_id,
5986
volume=vol_id,
60-
url=self._make_url(item["slug"], item["project"]["slug"]),
61-
title=item["full_title"],
87+
url=self._make_url(item["slug"], project_slug),
88+
title=item["title"],
6289
)
6390
)
6491

0 commit comments

Comments
 (0)