diff --git a/README.md b/README.md index 24cacc5..1e7cc62 100644 --- a/README.md +++ b/README.md @@ -52,12 +52,22 @@ self-ignored `.reglance/` directory — nothing to add to `.gitignore`. | `viewports` | no | `[{ name, width, height, deviceScaleFactor? }]`. Defaults to `desktop` (1920×1080), `mobile` (390×844). | | `output` | no | Output directory. Defaults to `.reglance`. | | `pixelmatchOptions` | no | [pixelmatch](https://github.com/mapbox/pixelmatch) options, e.g. `{ "threshold": 0.1 }`. | -| `timeouts` | no | `{ goto, settle }` in ms. Navigation and post-scroll network-idle waits. Defaults `{ goto: 15000, settle: 8000 }`. Raise `settle` for slow, lazy-loading pages. | +| `timeouts` | no | `{ goto, settle }` in ms. `goto` bounds navigation; `settle` bounds each post-scroll wait (network idle, then image load/decode). Defaults `{ goto: 15000, settle: 8000 }`. Raise `settle` for slow, lazy-loading pages. | +| `blockHosts` | no | Hostnames to block requests to during capture, e.g. `["challenges.cloudflare.com"]`. Each entry also blocks its subdomains. | `domain` is only needed by `capture`; `control` and `compare` work on the files already captured. See [`reglance.example.json`](reglance.example.json) for a full example. +`blockHosts` aborts every request to the listed hosts (and their subdomains) +before it leaves the browser. Use it for third-party embeds that keep the +network busy and stall capture — CAPTCHA widgets like Cloudflare Turnstile, +ad tech, analytics — or that render differently on every load and pollute +diffs. Captures wait for the network to go idle, so a widget that polls or +retries indefinitely will otherwise time out every viewport on pages that +embed it. Entries are bare hostnames; `"example.org"` blocks `example.org` and +`sub.example.org` alike. + A viewport's optional `deviceScaleFactor` (device pixel ratio) renders the page as it would appear on a higher-density display — use `2` for a retina capture, `3` for some phones. 
It defaults to `1`. Captures sharing a DPR run in one @@ -114,6 +124,11 @@ guards against silently baselining bad data: - If a page never loads cleanly (after retries), `capture` reports it as degraded instead of treating it as a success. Add `--fail-on-degraded` to make the run exit non-zero in CI. +- `capture` scrolls each page one viewport at a time so every lazy-loaded + image is triggered, then waits (bounded by `timeouts.settle`) for all + images to load and decode before screenshotting — and warns per capture + when any image was still loading, instead of silently shipping a partial + screenshot. - `control` records each promotion in `.reglance/controls/manifest.json` and warns when it promoted fewer captures than expected (so the untouched controls are now stale). `compare` warns when the baseline mixes controls diff --git a/reglance.example.json b/reglance.example.json index f88c642..8426300 100644 --- a/reglance.example.json +++ b/reglance.example.json @@ -20,5 +20,6 @@ "timeouts": { "goto": 15000, "settle": 8000 - } + }, + "blockHosts": ["challenges.cloudflare.com"] } diff --git a/src/capture.mjs b/src/capture.mjs index 406e364..652703e 100644 --- a/src/capture.mjs +++ b/src/capture.mjs @@ -59,38 +59,109 @@ export function offDomainTargets(targets, domain) { }); } +/** + * Whether a URL's host is covered by the configured block list. + * + * An entry matches the host itself and all of its subdomains, so + * "example.org" blocks both "example.org" and "sub.example.org". Non-network + * URLs (blob:, data:, chrome-extension:) have no hostname and never match. + * + * @param {string} url - The request URL. + * @param {string[]} blockHosts - Normalized (lowercase) hostnames. + * @returns {boolean} True when the request should be blocked. 
+ */ +export function isBlockedHost(url, blockHosts) { + if (!blockHosts?.length) { + return false; + } + + let host; + try { + host = new URL(url).hostname.toLowerCase(); + } catch { + return false; + } + + if (!host) { + return false; + } + + return blockHosts.some( + (entry) => host === entry || host.endsWith(`.${entry}`) + ); +} + /** * Scroll the full height of the page and back to the top. * * Triggers lazy-loaded images and other on-scroll behavior so the screenshot - * captures the page as a visitor would see it. Scrolls to the bottom and waits - * for the page to grow, repeating until the height stabilizes — so a short page - * settles almost instantly while a tall one keeps going as content loads, - * rather than paying a fixed per-step delay across the whole height. + * captures the page as a visitor would see it. Steps one viewport at a time + * because lazy loaders (IntersectionObserver, native loading="lazy") only + * trigger for content near the viewport — jumping straight to the bottom + * skips everything in between, and which of those images load becomes a + * timing race. The height is re-read every step so content that loads and + * grows the page extends the walk. * * @param {import('playwright').Page} page - The page to scroll. */ async function autoScroll(page) { await page.evaluate(async () => { const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); - let lastHeight = -1; + const root = document.scrollingElement || document.documentElement; + let position = 0; + + // Cap iterations so a page (e.g. infinite scroll) can't loop forever. + for (let i = 0; i < 500; i++) { + position += window.innerHeight; + // 'instant' overrides a site's `scroll-behavior: smooth`, which + // would otherwise animate each step and lag behind this loop. + window.scrollTo({ top: position, behavior: 'instant' }); + await sleep(50); - // Cap iterations so a page that grows on every scroll (infinite feed) - // can't loop forever. 
- for (let i = 0; i < 100; i++) { - const height = document.body.scrollHeight; - if (height === lastHeight) { + if (position >= root.scrollHeight - window.innerHeight) { break; } - lastHeight = height; - window.scrollTo(0, height); - await sleep(50); } - window.scrollTo(0, 0); + window.scrollTo({ top: 0, behavior: 'instant' }); }); } +/** + * Wait for every image on the page to finish loading and be ready to paint. + * + * The network-idle settle alone is not enough: it only covers requests that + * have already started, says nothing about decode state, and when it times + * out on a busy server the capture proceeds silently with whatever images + * happened to arrive. decode() resolves once an image is loaded and decoded; + * a broken image rejects, which counts as settled — a missing image is the + * page's actual state, not something to keep waiting on. + * + * @param {import('playwright').Page} page - The page to wait on. + * @param {number} timeout - Max wait in ms. + * @returns {Promise<number>} How many images were still loading at timeout. + */ +async function waitForImages(page, timeout) { + return page.evaluate(async (maxWait) => { + const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + + // Hidden images are excluded: they cannot paint into the screenshot, + // and a hidden native-lazy image (e.g. a desktop-only image at a + // mobile viewport width) never loads at all by design — waiting for + // it would burn the full timeout and warn about nothing. + const images = Array.from(document.images).filter((img) => + img.checkVisibility ? img.checkVisibility() : true + ); + + await Promise.race([ + Promise.all(images.map((img) => img.decode().catch(() => {}))), + sleep(maxWait), + ]); + + return images.filter((img) => !img.complete).length; + }, timeout); +} + /** * Group viewports by their device scale factor, preserving order. 
* @@ -132,11 +203,11 @@ export function groupViewportsByScaleFactor(viewports) { * groupViewportsByScaleFactor); within a context the first viewport navigates * fresh and the rest reuse the page (reloading unless --skip-reload). * - * @param {import('playwright').Browser} browser - The shared browser. - * @param {object} target - The target ({ key, url }). - * @param {Array} viewports - Viewport definitions. - * @param {object} dirs - Output directory paths. - * @param {object} options - Capture options. + * @param {import('playwright').Browser} browser The shared browser. + * @param {object} target The target ({ key, url }). + * @param {Array} viewports Viewport definitions. + * @param {object} dirs Output directory paths. + * @param {object} options Capture options. * @returns {Promise<Array>} Failed slugs. */ async function captureTarget(browser, target, viewports, dirs, options) { @@ -145,6 +216,7 @@ async function captureTarget(browser, target, viewports, dirs, options) { retryCount = 2, timeouts = DEFAULT_TIMEOUTS, ignoreHTTPSErrors = false, + blockHosts = [], } = options; const failures = []; let currentSlug = target.key; @@ -154,10 +226,24 @@ async function captureTarget(browser, target, viewports, dirs, options) { ignoreHTTPSErrors, deviceScaleFactor: group.deviceScaleFactor, }); + + if (blockHosts.length) { + await context.route('**/*', (route) => { + if (isBlockedHost(route.request().url(), blockHosts)) { + return route.abort('blockedbyclient'); + } + return route.continue(); + }); + } + const page = await context.newPage(); const failedResources = new Set(); page.on('requestfailed', (request) => { + // A deliberately blocked host is not a load failure. 
+ if (isBlockedHost(request.url(), blockHosts)) { + return; + } const type = request.resourceType(); if (type === 'stylesheet' || type === 'script') { failedResources.add(request.url()); @@ -240,6 +326,17 @@ async function captureTarget(browser, target, viewports, dirs, options) { // Slow/never-idle page; screenshot what we have. }); + const pendingImages = await waitForImages( + page, + timeouts.settle + ); + if (pendingImages > 0) { + console.warn( + ` ⚠️ ${slug}: ${pendingImages} image(s) were still loading at capture ` + + '— the screenshot may be missing them. Raise "timeouts.settle" if this persists.' + ); + } + const imagePath = path.join(dirs.captures, `${slug}.png`); await page.screenshot({ path: imagePath, fullPage: true }); @@ -270,8 +367,8 @@ async function captureTarget(browser, target, viewports, dirs, options) { * opt-in (so existing best-effort/partial workflows keep working) — see the * D-004 decision. * - * @param {Array} failures - The degraded-slug records. - * @param {boolean} [failOnDegraded] - Whether degraded captures fail the run. + * @param {Array} failures The degraded-slug records. + * @param {boolean} [failOnDegraded] Whether degraded captures fail the run. * @returns {boolean} True when the run should signal failure. */ export function shouldFailRun(failures, failOnDegraded = false) { @@ -281,13 +378,13 @@ export function shouldFailRun(failures, failOnDegraded = false) { /** * Capture screenshots for every configured target. * - * @param {object} config - The normalized config. - * @param {object} [options] - Capture options. - * @param {number} [options.concurrency] - Parallel browser contexts. - * @param {number} [options.staggerDelay] - Delay (ms) between context starts. - * @param {boolean}[options.skipReload] - Reuse the page between viewports. - * @param {boolean}[options.failOnDegraded] - Exit non-zero if any capture is degraded. - * @param {Array} [options.only] - Limit to these target keys. 
+ * @param {object} config The normalized config. + * @param {object} [options] Capture options. + * @param {number} [options.concurrency] Parallel browser contexts. + * @param {number} [options.staggerDelay] Delay (ms) between context starts. + * @param {boolean}[options.skipReload] Reuse the page between viewports. + * @param {boolean}[options.failOnDegraded] Exit non-zero if any capture is degraded. + * @param {Array} [options.only] Limit to these target keys. * @returns {Promise<{ failures: Array }>} The degraded-slug records. */ export async function capture(config, options = {}) { @@ -317,6 +414,11 @@ export async function capture(config, options = {}) { console.log(`Viewports per target: ${config.viewports.length}`); console.log(`Total screenshots: ${totalShots}`); console.log(`Concurrency: ${concurrency} parallel contexts`); + if (config.blockHosts?.length) { + console.log( + `Blocking hosts (and subdomains): ${config.blockHosts.join(', ')}` + ); + } // Ignore TLS certificate errors only for local development hosts (where // self-signed certs are normal). For a non-local host, validation stays on @@ -370,7 +472,12 @@ export async function capture(config, options = {}) { target, config.viewports, config.dirs, - { skipReload, timeouts: config.timeouts, ignoreHTTPSErrors } + { + skipReload, + timeouts: config.timeouts, + ignoreHTTPSErrors, + blockHosts: config.blockHosts ?? [], + } ); failures.push(...targetFailures); } diff --git a/src/config.mjs b/src/config.mjs index 3120987..8ac5633 100644 --- a/src/config.mjs +++ b/src/config.mjs @@ -184,6 +184,50 @@ export function validateViewports(viewports) { }); } +/** + * Normalize and validate the `blockHosts` config value. + * + * Entries are bare hostnames; each blocks the host itself and all of its + * subdomains during capture (a leading `*.` is accepted and equivalent). + * Anything with a scheme, path, or port is rejected up front — silently + * matching nothing would read as "the block isn't working". 
+ * + * @param {*} blockHosts - The raw config value. + * @returns {string[]} Lowercased hostnames with any `*.` prefix removed. + */ +export function normalizeBlockHosts(blockHosts) { + if (blockHosts === undefined) { + return []; + } + + if (!Array.isArray(blockHosts)) { + throw new Error( + '❌ Invalid "blockHosts": expected an array of hostnames.\n' + + '💡 Use entries like ["challenges.cloudflare.com"].' + ); + } + + return blockHosts.map((entry) => { + if (typeof entry !== 'string' || !entry.trim()) { + throw new Error( + `❌ Invalid "blockHosts" entry ${JSON.stringify(entry)}: expected a non-empty string.\n` + + '💡 Use a bare hostname like "challenges.cloudflare.com".' + ); + } + + const host = entry.trim().toLowerCase().replace(/^\*\./, ''); + + if (/[/:\s]/.test(host)) { + throw new Error( + `❌ Invalid "blockHosts" entry ${JSON.stringify(entry)}: expected a bare hostname (no scheme, port, or path).\n` + + '💡 Use "challenges.cloudflare.com", not "https://challenges.cloudflare.com/".' + ); + } + + return host; + }); +} + /** * Join a domain origin and a path into a full URL. 
* @@ -295,6 +339,7 @@ export function loadConfig({ configPath = 'reglance.json', domain } = {}) { viewports, targets, pixelmatchOptions, + blockHosts: normalizeBlockHosts(raw.blockHosts), timeouts: { ...DEFAULT_TIMEOUTS, ...raw.timeouts, diff --git a/test/capture.test.mjs b/test/capture.test.mjs index 81d2c21..378c9c6 100644 --- a/test/capture.test.mjs +++ b/test/capture.test.mjs @@ -5,6 +5,7 @@ import { isLocalHost, offDomainTargets, groupViewportsByScaleFactor, + isBlockedHost, } from '../src/capture.mjs'; const FAILURES = [ @@ -63,6 +64,60 @@ test('offDomainTargets returns nothing without a configured domain', () => { assert.deepEqual(offDomainTargets(targets, null), []); }); +test('isBlockedHost matches a listed host exactly', () => { + assert.equal( + isBlockedHost('https://challenges.cloudflare.com/turnstile/v0/api.js', [ + 'challenges.cloudflare.com', + ]), + true + ); +}); + +test('isBlockedHost matches subdomains of a listed host', () => { + assert.equal( + isBlockedHost('https://sub.example.org/f83b/index.js', ['example.org']), + true + ); +}); + +test('isBlockedHost does not match a host that merely ends with an entry', () => { + // "cloudflare.com" must not block "notcloudflare.com". 
+ assert.equal( + isBlockedHost('https://notcloudflare.com/x.js', ['cloudflare.com']), + false + ); +}); + +test('isBlockedHost ignores case in the request host', () => { + assert.equal( + isBlockedHost('https://Challenges.Cloudflare.com/x', [ + 'challenges.cloudflare.com', + ]), + true + ); +}); + +test('isBlockedHost leaves unlisted hosts alone', () => { + assert.equal( + isBlockedHost('https://site.test/style.css', [ + 'challenges.cloudflare.com', + ]), + false + ); +}); + +test('isBlockedHost never matches URLs without a hostname', () => { + const blocked = ['challenges.cloudflare.com']; + assert.equal(isBlockedHost('blob:https://x.test/abc-123', blocked), false); + assert.equal(isBlockedHost('data:text/plain,hi', blocked), false); + assert.equal(isBlockedHost('not a url', blocked), false); +}); + +test('isBlockedHost is false for an empty or missing block list', () => { + assert.equal(isBlockedHost('https://x.test/', []), false); + assert.equal(isBlockedHost('https://x.test/', undefined), false); +}); + test('groupViewportsByScaleFactor defaults a missing DPR to 1', () => { const groups = groupViewportsByScaleFactor([ { name: 'desktop', width: 1920, height: 1080 }, diff --git a/test/config.test.mjs b/test/config.test.mjs index 002de0d..afa92a9 100644 --- a/test/config.test.mjs +++ b/test/config.test.mjs @@ -9,6 +9,7 @@ import { validateViewports, filterTargets, loadConfig, + normalizeBlockHosts, DEFAULT_PIXELMATCH_OPTIONS, } from '../src/config.mjs'; @@ -275,6 +276,58 @@ test('loadConfig merges configured timeouts over the defaults', () => { assert.equal(config.timeouts.goto, 15000); }); +test('normalizeBlockHosts defaults a missing value to an empty list', () => { + assert.deepEqual(normalizeBlockHosts(undefined), []); +}); + +test('normalizeBlockHosts lowercases entries and strips a *. 
prefix', () => { + assert.deepEqual( + normalizeBlockHosts(['Challenges.Cloudflare.com', '*.kit.com']), + ['challenges.cloudflare.com', 'kit.com'] + ); +}); + +test('normalizeBlockHosts rejects a non-array value', () => { + assert.throws( + () => normalizeBlockHosts('challenges.cloudflare.com'), + /Invalid "blockHosts"/ + ); +}); + +test('normalizeBlockHosts rejects empty and non-string entries', () => { + assert.throws(() => normalizeBlockHosts(['']), /non-empty string/); + assert.throws(() => normalizeBlockHosts([42]), /non-empty string/); +}); + +test('normalizeBlockHosts rejects entries with a scheme, port, or path', () => { + assert.throws( + () => normalizeBlockHosts(['https://challenges.cloudflare.com']), + /bare hostname/ + ); + assert.throws(() => normalizeBlockHosts(['kit.com/path']), /bare hostname/); + assert.throws(() => normalizeBlockHosts(['kit.com:8080']), /bare hostname/); +}); + +test('loadConfig defaults blockHosts to an empty list', () => { + const configPath = writeConfig({ + domain: 'site.test', + paths: { home: '/' }, + }); + assert.deepEqual(loadConfig({ configPath }).blockHosts, []); +}); + +test('loadConfig normalizes configured blockHosts', () => { + const configPath = writeConfig({ + domain: 'site.test', + paths: { home: '/' }, + blockHosts: ['*.Kit.com', 'challenges.cloudflare.com'], + }); + assert.deepEqual(loadConfig({ configPath }).blockHosts, [ + 'kit.com', + 'challenges.cloudflare.com', + ]); +}); + test('loadConfig rejects a non-array diffColor instead of crashing later', () => { const configPath = writeConfig({ domain: 'site.test',