From 9d18d6d2e1c640bd7d7d6e6bb6ba8fe26c831741 Mon Sep 17 00:00:00 2001 From: jeremyfelt Date: Wed, 10 Jun 2026 11:00:18 -0700 Subject: [PATCH 1/2] Add a blockHosts config option to block third-party hosts during capture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A page embedding a widget that never lets the network go quiet — a CAPTCHA like Cloudflare Turnstile that polls and retries indefinitely under automation, ad tech, analytics — times out the networkidle wait on every viewport, burning the full retry budget per capture. blockHosts in reglance.json takes bare hostnames and aborts every request to them (and their subdomains) at the browser, so the page goes idle and captures stay deterministic. Blocked requests are excluded from the critical-resource retry check, since a deliberately blocked script firing requestfailed is not a load failure. Co-Authored-By: Claude Fable 5 --- README.md | 10 +++++++ reglance.example.json | 3 ++- src/capture.mjs | 61 ++++++++++++++++++++++++++++++++++++++++++- src/config.mjs | 45 +++++++++++++++++++++++++++++++ test/capture.test.mjs | 55 ++++++++++++++++++++++++++++++++++++++ test/config.test.mjs | 53 +++++++++++++++++++++++++++++++++++++ 6 files changed, 225 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 24cacc5..3e00f03 100644 --- a/README.md +++ b/README.md @@ -53,11 +53,21 @@ self-ignored `.reglance/` directory — nothing to add to `.gitignore`. | `output` | no | Output directory. Defaults to `.reglance`. | | `pixelmatchOptions` | no | [pixelmatch](https://github.com/mapbox/pixelmatch) options, e.g. `{ "threshold": 0.1 }`. | | `timeouts` | no | `{ goto, settle }` in ms. Navigation and post-scroll network-idle waits. Defaults `{ goto: 15000, settle: 8000 }`. Raise `settle` for slow, lazy-loading pages. | +| `blockHosts` | no | Hostnames to block requests to during capture, e.g. `["challenges.cloudflare.com"]`. Each entry also blocks its subdomains. 
| `domain` is only needed by `capture`; `control` and `compare` work on the files already captured. See [`reglance.example.json`](reglance.example.json) for a full example. +`blockHosts` aborts every request to the listed hosts (and their subdomains) +before it leaves the browser. Use it for third-party embeds that keep the +network busy and stall capture — CAPTCHA widgets like Cloudflare Turnstile, +ad tech, analytics — or that render differently on every load and pollute +diffs. Captures wait for the network to go idle, so a widget that polls or +retries indefinitely will otherwise time out every viewport on pages that +embed it. Entries are bare hostnames; `"kit.com"` blocks `kit.com` and +`pinchofyum.kit.com` alike. + A viewport's optional `deviceScaleFactor` (device pixel ratio) renders the page as it would appear on a higher-density display — use `2` for a retina capture, `3` for some phones. It defaults to `1`. Captures sharing a DPR run in one diff --git a/reglance.example.json b/reglance.example.json index f88c642..8426300 100644 --- a/reglance.example.json +++ b/reglance.example.json @@ -20,5 +20,6 @@ "timeouts": { "goto": 15000, "settle": 8000 - } + }, + "blockHosts": ["challenges.cloudflare.com"] } diff --git a/src/capture.mjs b/src/capture.mjs index 406e364..3cae16f 100644 --- a/src/capture.mjs +++ b/src/capture.mjs @@ -59,6 +59,38 @@ export function offDomainTargets(targets, domain) { }); } +/** + * Whether a URL's host is covered by the configured block list. + * + * An entry matches the host itself and all of its subdomains, so + * "kit.com" blocks both "kit.com" and "pinchofyum.kit.com". Non-network + * URLs (blob:, data:, chrome-extension:) have no hostname and never match. + * + * @param {string} url - The request URL. + * @param {string[]} blockHosts - Normalized (lowercase) hostnames. + * @returns {boolean} True when the request should be blocked. 
+ */ +export function isBlockedHost(url, blockHosts) { + if (!blockHosts?.length) { + return false; + } + + let host; + try { + host = new URL(url).hostname.toLowerCase(); + } catch { + return false; + } + + if (!host) { + return false; + } + + return blockHosts.some( + (entry) => host === entry || host.endsWith(`.${entry}`) + ); +} + /** * Scroll the full height of the page and back to the top. * @@ -145,6 +177,7 @@ async function captureTarget(browser, target, viewports, dirs, options) { retryCount = 2, timeouts = DEFAULT_TIMEOUTS, ignoreHTTPSErrors = false, + blockHosts = [], } = options; const failures = []; let currentSlug = target.key; @@ -154,10 +187,26 @@ async function captureTarget(browser, target, viewports, dirs, options) { ignoreHTTPSErrors, deviceScaleFactor: group.deviceScaleFactor, }); + + if (blockHosts.length) { + await context.route('**/*', (route) => { + if (isBlockedHost(route.request().url(), blockHosts)) { + return route.abort('blockedbyclient'); + } + return route.continue(); + }); + } + const page = await context.newPage(); const failedResources = new Set(); page.on('requestfailed', (request) => { + // A deliberately blocked host is not a load failure — without this + // guard, blocking a third-party script would trigger the + // critical-resource retry on every attempt. + if (isBlockedHost(request.url(), blockHosts)) { + return; + } const type = request.resourceType(); if (type === 'stylesheet' || type === 'script') { failedResources.add(request.url()); @@ -317,6 +366,11 @@ export async function capture(config, options = {}) { console.log(`Viewports per target: ${config.viewports.length}`); console.log(`Total screenshots: ${totalShots}`); console.log(`Concurrency: ${concurrency} parallel contexts`); + if (config.blockHosts?.length) { + console.log( + `Blocking hosts (and subdomains): ${config.blockHosts.join(', ')}` + ); + } // Ignore TLS certificate errors only for local development hosts (where // self-signed certs are normal). 
For a non-local host, validation stays on @@ -370,7 +424,12 @@ export async function capture(config, options = {}) { target, config.viewports, config.dirs, - { skipReload, timeouts: config.timeouts, ignoreHTTPSErrors } + { + skipReload, + timeouts: config.timeouts, + ignoreHTTPSErrors, + blockHosts: config.blockHosts ?? [], + } ); failures.push(...targetFailures); } diff --git a/src/config.mjs b/src/config.mjs index 3120987..8ac5633 100644 --- a/src/config.mjs +++ b/src/config.mjs @@ -184,6 +184,50 @@ export function validateViewports(viewports) { }); } +/** + * Normalize and validate the `blockHosts` config value. + * + * Entries are bare hostnames; each blocks the host itself and all of its + * subdomains during capture (a leading `*.` is accepted and equivalent). + * Anything with a scheme, path, or port is rejected up front — silently + * matching nothing would read as "the block isn't working". + * + * @param {*} blockHosts - The raw config value. + * @returns {string[]} Lowercased hostnames with any `*.` prefix removed. + */ +export function normalizeBlockHosts(blockHosts) { + if (blockHosts === undefined) { + return []; + } + + if (!Array.isArray(blockHosts)) { + throw new Error( + '❌ Invalid "blockHosts": expected an array of hostnames.\n' + + '💡 Use entries like ["challenges.cloudflare.com"].' + ); + } + + return blockHosts.map((entry) => { + if (typeof entry !== 'string' || !entry.trim()) { + throw new Error( + `❌ Invalid "blockHosts" entry ${JSON.stringify(entry)}: expected a non-empty string.\n` + + '💡 Use a bare hostname like "challenges.cloudflare.com".' + ); + } + + const host = entry.trim().toLowerCase().replace(/^\*\./, ''); + + if (/[/:\s]/.test(host)) { + throw new Error( + `❌ Invalid "blockHosts" entry ${JSON.stringify(entry)}: expected a bare hostname (no scheme, port, or path).\n` + + '💡 Use "challenges.cloudflare.com", not "https://challenges.cloudflare.com/".' 
+ ); + } + + return host; + }); +} + /** * Join a domain origin and a path into a full URL. * @@ -295,6 +339,7 @@ export function loadConfig({ configPath = 'reglance.json', domain } = {}) { viewports, targets, pixelmatchOptions, + blockHosts: normalizeBlockHosts(raw.blockHosts), timeouts: { ...DEFAULT_TIMEOUTS, ...raw.timeouts, diff --git a/test/capture.test.mjs b/test/capture.test.mjs index 81d2c21..f714d0f 100644 --- a/test/capture.test.mjs +++ b/test/capture.test.mjs @@ -5,6 +5,7 @@ import { isLocalHost, offDomainTargets, groupViewportsByScaleFactor, + isBlockedHost, } from '../src/capture.mjs'; const FAILURES = [ @@ -63,6 +64,60 @@ test('offDomainTargets returns nothing without a configured domain', () => { assert.deepEqual(offDomainTargets(targets, null), []); }); +test('isBlockedHost matches a listed host exactly', () => { + assert.equal( + isBlockedHost('https://challenges.cloudflare.com/turnstile/v0/api.js', [ + 'challenges.cloudflare.com', + ]), + true + ); +}); + +test('isBlockedHost matches subdomains of a listed host', () => { + assert.equal( + isBlockedHost('https://pinchofyum.kit.com/f83b/index.js', ['kit.com']), + true + ); +}); + +test('isBlockedHost does not match a host that merely ends with an entry', () => { + // "cloudflare.com" must not block "notcloudflare.com". 
+ assert.equal( + isBlockedHost('https://notcloudflare.com/x.js', ['cloudflare.com']), + false + ); +}); + +test('isBlockedHost ignores case in the request host', () => { + assert.equal( + isBlockedHost('https://Challenges.Cloudflare.com/x', [ + 'challenges.cloudflare.com', + ]), + true + ); +}); + +test('isBlockedHost leaves unlisted hosts alone', () => { + assert.equal( + isBlockedHost('https://site.test/style.css', [ + 'challenges.cloudflare.com', + ]), + false + ); +}); + +test('isBlockedHost never matches URLs without a hostname', () => { + const blocked = ['challenges.cloudflare.com']; + assert.equal(isBlockedHost('blob:https://x.test/abc-123', blocked), false); + assert.equal(isBlockedHost('data:text/plain,hi', blocked), false); + assert.equal(isBlockedHost('not a url', blocked), false); +}); + +test('isBlockedHost is false for an empty or missing block list', () => { + assert.equal(isBlockedHost('https://x.test/', []), false); + assert.equal(isBlockedHost('https://x.test/', undefined), false); +}); + test('groupViewportsByScaleFactor defaults a missing DPR to 1', () => { const groups = groupViewportsByScaleFactor([ { name: 'desktop', width: 1920, height: 1080 }, diff --git a/test/config.test.mjs b/test/config.test.mjs index 002de0d..afa92a9 100644 --- a/test/config.test.mjs +++ b/test/config.test.mjs @@ -9,6 +9,7 @@ import { validateViewports, filterTargets, loadConfig, + normalizeBlockHosts, DEFAULT_PIXELMATCH_OPTIONS, } from '../src/config.mjs'; @@ -275,6 +276,58 @@ test('loadConfig merges configured timeouts over the defaults', () => { assert.equal(config.timeouts.goto, 15000); }); +test('normalizeBlockHosts defaults a missing value to an empty list', () => { + assert.deepEqual(normalizeBlockHosts(undefined), []); +}); + +test('normalizeBlockHosts lowercases entries and strips a *. 
prefix', () => { + assert.deepEqual( + normalizeBlockHosts(['Challenges.Cloudflare.com', '*.kit.com']), + ['challenges.cloudflare.com', 'kit.com'] + ); +}); + +test('normalizeBlockHosts rejects a non-array value', () => { + assert.throws( + () => normalizeBlockHosts('challenges.cloudflare.com'), + /Invalid "blockHosts"/ + ); +}); + +test('normalizeBlockHosts rejects empty and non-string entries', () => { + assert.throws(() => normalizeBlockHosts(['']), /non-empty string/); + assert.throws(() => normalizeBlockHosts([42]), /non-empty string/); +}); + +test('normalizeBlockHosts rejects entries with a scheme, port, or path', () => { + assert.throws( + () => normalizeBlockHosts(['https://challenges.cloudflare.com']), + /bare hostname/ + ); + assert.throws(() => normalizeBlockHosts(['kit.com/path']), /bare hostname/); + assert.throws(() => normalizeBlockHosts(['kit.com:8080']), /bare hostname/); +}); + +test('loadConfig defaults blockHosts to an empty list', () => { + const configPath = writeConfig({ + domain: 'site.test', + paths: { home: '/' }, + }); + assert.deepEqual(loadConfig({ configPath }).blockHosts, []); +}); + +test('loadConfig normalizes configured blockHosts', () => { + const configPath = writeConfig({ + domain: 'site.test', + paths: { home: '/' }, + blockHosts: ['*.Kit.com', 'challenges.cloudflare.com'], + }); + assert.deepEqual(loadConfig({ configPath }).blockHosts, [ + 'kit.com', + 'challenges.cloudflare.com', + ]); +}); + test('loadConfig rejects a non-array diffColor instead of crashing later', () => { const configPath = writeConfig({ domain: 'site.test', From 1f9a3382ef096b97ef12f801954a59e0e6ed947b Mon Sep 17 00:00:00 2001 From: jeremyfelt Date: Wed, 10 Jun 2026 11:11:40 -0700 Subject: [PATCH 2/2] Make image capture deterministic: stepwise scroll + explicit image wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit autoScroll jumped straight to the bottom of the page, so lazy loaders (IntersectionObserver, 
native loading="lazy") never fired for anything in between — which images made it into a capture was a timing race, producing noisy diffs. It now steps one viewport at a time so every lazy image is triggered, re-reading the height as content grows. After the network-idle settle, capture now also waits (bounded by timeouts.settle) for every visible image to load and decode, and warns per capture when any image was still loading instead of silently shipping a partial screenshot. Hidden images are excluded: they cannot paint, and a hidden native-lazy image (e.g. a desktop-only image at a mobile width) never loads by design. Also generalizes hostname examples in docs and comments. Co-Authored-By: Claude Fable 5 --- README.md | 11 +++-- src/capture.mjs | 112 ++++++++++++++++++++++++++++++------------ test/capture.test.mjs | 2 +- 3 files changed, 89 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 3e00f03..1e7cc62 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ self-ignored `.reglance/` directory — nothing to add to `.gitignore`. | `viewports` | no | `[{ name, width, height, deviceScaleFactor? }]`. Defaults to `desktop` (1920×1080), `mobile` (390×844). | | `output` | no | Output directory. Defaults to `.reglance`. | | `pixelmatchOptions` | no | [pixelmatch](https://github.com/mapbox/pixelmatch) options, e.g. `{ "threshold": 0.1 }`. | -| `timeouts` | no | `{ goto, settle }` in ms. Navigation and post-scroll network-idle waits. Defaults `{ goto: 15000, settle: 8000 }`. Raise `settle` for slow, lazy-loading pages. | +| `timeouts` | no | `{ goto, settle }` in ms. `goto` bounds navigation; `settle` bounds each post-scroll wait (network idle, then image load/decode). Defaults `{ goto: 15000, settle: 8000 }`. Raise `settle` for slow, lazy-loading pages. | | `blockHosts` | no | Hostnames to block requests to during capture, e.g. `["challenges.cloudflare.com"]`. Each entry also blocks its subdomains. 
| `domain` is only needed by `capture`; `control` and `compare` work on the files @@ -65,8 +65,8 @@ network busy and stall capture — CAPTCHA widgets like Cloudflare Turnstile, ad tech, analytics — or that render differently on every load and pollute diffs. Captures wait for the network to go idle, so a widget that polls or retries indefinitely will otherwise time out every viewport on pages that -embed it. Entries are bare hostnames; `"kit.com"` blocks `kit.com` and -`pinchofyum.kit.com` alike. +embed it. Entries are bare hostnames; `"example.org"` blocks `example.org` and +`sub.example.org` alike. A viewport's optional `deviceScaleFactor` (device pixel ratio) renders the page as it would appear on a higher-density display — use `2` for a retina capture, @@ -124,6 +124,11 @@ guards against silently baselining bad data: - If a page never loads cleanly (after retries), `capture` reports it as degraded instead of treating it as a success. Add `--fail-on-degraded` to make the run exit non-zero in CI. +- `capture` scrolls each page one viewport at a time so every lazy-loaded + image is triggered, then waits (bounded by `timeouts.settle`) for all + visible images to load and decode before screenshotting — and warns per capture + when any image was still loading, instead of silently shipping a partial + screenshot. - `control` records each promotion in `.reglance/controls/manifest.json` and warns when it promoted fewer captures than expected (so the untouched controls are now stale). `compare` warns when the baseline mixes controls diff --git a/src/capture.mjs b/src/capture.mjs index 3cae16f..652703e 100644 --- a/src/capture.mjs +++ b/src/capture.mjs @@ -63,7 +63,7 @@ export function offDomainTargets(targets, domain) { * Whether a URL's host is covered by the configured block list. * * An entry matches the host itself and all of its subdomains, so - * "kit.com" blocks both "kit.com" and "pinchofyum.kit.com". 
Non-network + * "example.org" blocks both "example.org" and "sub.example.org". Non-network * URLs (blob:, data:, chrome-extension:) have no hostname and never match. * * @param {string} url - The request URL. @@ -95,34 +95,73 @@ export function isBlockedHost(url, blockHosts) { * Scroll the full height of the page and back to the top. * * Triggers lazy-loaded images and other on-scroll behavior so the screenshot - * captures the page as a visitor would see it. Scrolls to the bottom and waits - * for the page to grow, repeating until the height stabilizes — so a short page - * settles almost instantly while a tall one keeps going as content loads, - * rather than paying a fixed per-step delay across the whole height. + * captures the page as a visitor would see it. Steps one viewport at a time + * because lazy loaders (IntersectionObserver, native loading="lazy") only + * trigger for content near the viewport — jumping straight to the bottom + * skips everything in between, and which of those images load becomes a + * timing race. The height is re-read every step so content that loads and + * grows the page extends the walk. * * @param {import('playwright').Page} page - The page to scroll. */ async function autoScroll(page) { await page.evaluate(async () => { const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); - let lastHeight = -1; + const root = document.scrollingElement || document.documentElement; + let position = 0; + + // Cap iterations so a page can't loop forever (e.g. infinite scroll). + for (let i = 0; i < 500; i++) { + position += window.innerHeight; + // 'instant' overrides a site's `scroll-behavior: smooth`, which + // would otherwise animate each step and fall behind this loop. + window.scrollTo({ top: position, behavior: 'instant' }); + await sleep(50); - // Cap iterations so a page that grows on every scroll (infinite feed) - // can't loop forever. 
- for (let i = 0; i < 100; i++) { - const height = document.body.scrollHeight; - if (height === lastHeight) { + if (position >= root.scrollHeight - window.innerHeight) { break; } - lastHeight = height; - window.scrollTo(0, height); - await sleep(50); } - window.scrollTo(0, 0); + window.scrollTo({ top: 0, behavior: 'instant' }); }); } +/** + * Wait for every visible image on the page to finish loading and be ready to paint. + * + * The network-idle settle alone is not enough: it only covers requests that + * have already started, says nothing about decode state, and when it times + * out on a busy server the capture proceeds silently with whatever images + * happened to arrive. decode() resolves once an image is loaded and decoded; + * a broken image rejects, which counts as settled — a missing image is the + * page's actual state, not something to keep waiting on. + * + * @param {import('playwright').Page} page - The page to wait on. + * @param {number} timeout - Max wait in ms. + * @returns {Promise<number>} How many images were still loading at timeout. + */ +async function waitForImages(page, timeout) { + return page.evaluate(async (maxWait) => { + const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + + // Hidden images are excluded: they cannot paint into the screenshot, + // and a hidden native-lazy image (e.g. a desktop-only image at a + // mobile viewport width) never loads at all by design — waiting for + // it would burn the full timeout and warn about nothing. + const images = Array.from(document.images).filter((img) => + img.checkVisibility ? img.checkVisibility() : true + ); + + await Promise.race([ + Promise.all(images.map((img) => img.decode().catch(() => {}))), + sleep(maxWait), + ]); + + return images.filter((img) => !img.complete).length; + }, timeout); +} + /** * Group viewports by their device scale factor, preserving order. 
* @@ -164,11 +203,11 @@ export function groupViewportsByScaleFactor(viewports) { * groupViewportsByScaleFactor); within a context the first viewport navigates * fresh and the rest reuse the page (reloading unless --skip-reload). * - * @param {import('playwright').Browser} browser - The shared browser. - * @param {object} target - The target ({ key, url }). - * @param {Array} viewports - Viewport definitions. - * @param {object} dirs - Output directory paths. - * @param {object} options - Capture options. + * @param {import('playwright').Browser} browser The shared browser. + * @param {object} target The target ({ key, url }). + * @param {Array} viewports Viewport definitions. + * @param {object} dirs Output directory paths. + * @param {object} options Capture options. * @returns {Promise>} Failed slugs. */ async function captureTarget(browser, target, viewports, dirs, options) { @@ -201,9 +240,7 @@ async function captureTarget(browser, target, viewports, dirs, options) { const failedResources = new Set(); page.on('requestfailed', (request) => { - // A deliberately blocked host is not a load failure — without this - // guard, blocking a third-party script would trigger the - // critical-resource retry on every attempt. + // A deliberately blocked host is not a load failure. if (isBlockedHost(request.url(), blockHosts)) { return; } @@ -289,6 +326,17 @@ async function captureTarget(browser, target, viewports, dirs, options) { // Slow/never-idle page; screenshot what we have. }); + const pendingImages = await waitForImages( + page, + timeouts.settle + ); + if (pendingImages > 0) { + console.warn( + ` ⚠️ ${slug}: ${pendingImages} image(s) were still loading at capture ` + + '— the screenshot may be missing them. Raise "timeouts.settle" if this persists.' 
+ ); + } + const imagePath = path.join(dirs.captures, `${slug}.png`); await page.screenshot({ path: imagePath, fullPage: true }); @@ -319,8 +367,8 @@ async function captureTarget(browser, target, viewports, dirs, options) { * opt-in (so existing best-effort/partial workflows keep working) — see the * D-004 decision. * - * @param {Array} failures - The degraded-slug records. - * @param {boolean} [failOnDegraded] - Whether degraded captures fail the run. + * @param {Array} failures The degraded-slug records. + * @param {boolean} [failOnDegraded] Whether degraded captures fail the run. * @returns {boolean} True when the run should signal failure. */ export function shouldFailRun(failures, failOnDegraded = false) { @@ -330,13 +378,13 @@ export function shouldFailRun(failures, failOnDegraded = false) { /** * Capture screenshots for every configured target. * - * @param {object} config - The normalized config. - * @param {object} [options] - Capture options. - * @param {number} [options.concurrency] - Parallel browser contexts. - * @param {number} [options.staggerDelay] - Delay (ms) between context starts. - * @param {boolean}[options.skipReload] - Reuse the page between viewports. - * @param {boolean}[options.failOnDegraded] - Exit non-zero if any capture is degraded. - * @param {Array} [options.only] - Limit to these target keys. + * @param {object} config The normalized config. + * @param {object} [options] Capture options. + * @param {number} [options.concurrency] Parallel browser contexts. + * @param {number} [options.staggerDelay] Delay (ms) between context starts. + * @param {boolean}[options.skipReload] Reuse the page between viewports. + * @param {boolean}[options.failOnDegraded] Exit non-zero if any capture is degraded. + * @param {Array} [options.only] Limit to these target keys. * @returns {Promise<{ failures: Array }>} The degraded-slug records. 
*/ export async function capture(config, options = {}) { diff --git a/test/capture.test.mjs b/test/capture.test.mjs index f714d0f..378c9c6 100644 --- a/test/capture.test.mjs +++ b/test/capture.test.mjs @@ -75,7 +75,7 @@ test('isBlockedHost matches a listed host exactly', () => { test('isBlockedHost matches subdomains of a listed host', () => { assert.equal( - isBlockedHost('https://pinchofyum.kit.com/f83b/index.js', ['kit.com']), + isBlockedHost('https://sub.example.org/f83b/index.js', ['example.org']), true ); });