From a6bc35d14cbd118d35d5ecb44f48e2c883018870 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 25 Sep 2025 14:25:48 -0400 Subject: [PATCH 1/9] Add CLI option to respect robots.txt disallows When enabled, the new --robots flag will result in the crawler fetching robots.txt for each page origin, cacheing in Redis by URL to avoid duplicate fetches, and checking if URLs are allowed by the policies therein before queueing. --- package.json | 1 + src/crawler.ts | 107 ++++++++++++++++++++++++++++++++++++++++++ src/util/argParser.ts | 7 +++ src/util/logger.ts | 1 + src/util/state.ts | 14 ++++++ 5 files changed, 130 insertions(+) diff --git a/package.json b/package.json index c009b0f66..c93139eea 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "pixelmatch": "^5.3.0", "pngjs": "^7.0.0", "puppeteer-core": "^24.30.0", + "robots-parser": "^3.0.1", "sax": "^1.3.0", "sharp": "^0.32.6", "tsc": "^2.0.4", diff --git a/src/crawler.ts b/src/crawler.ts index 10501496b..13d860ecc 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -3,6 +3,8 @@ import path from "path"; import fs, { WriteStream } from "fs"; import os from "os"; import fsp from "fs/promises"; +import { fetch as undiciFetch } from "undici"; +import robotsParser, { Robot } from "robots-parser"; import { RedisCrawlState, @@ -36,6 +38,7 @@ import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js"; import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js"; +import { getProxyDispatcher } from "./util/proxy.js"; import { Browser } from "./util/browser.js"; @@ -1265,6 +1268,98 @@ self.__bx_behaviors.selectMainBehavior(); } } + async _fetchRobots(url: string) { + while (true) { + const resp = await undiciFetch(url, { + headers: this.headers, + dispatcher: getProxyDispatcher(url), + }); + + if (resp.ok) { + return resp; + } + + const retry = resp.headers.get("retry-after"); + + if (retry) { + logger.debug( + "Robots.txt fetch: Retry after", + { url, retrySeconds: retry }, + "robots", + ); + await sleep(parseInt(retry)); + continue; + } + + logger.debug( + "Robots.txt not fetched", + { url, status: resp.status }, + "robots", + ); + return null; + } + return null; + } + + async fetchAndParseRobots( + url: string, + logDetails: LogDetails, + ): Promise { + // Fetch robots.txt for url's host and return parser, caching robots + // bodies in Redis by their URL + // TODO: Consider using an LRU cache/only cache so many robots responses + // in Redis at one time and re-fetch if no longer in cache to avoid + // exhausting memory on very large crawls across many hosts + const urlParser = new URL(url); + const robotsUrl = `${urlParser.origin}/robots.txt`; + + const cachedRobots = await this.crawlState.getCachedRobots(robotsUrl); + if (cachedRobots) { + logger.debug( + "Using cached robots.txt body", + { + url: robotsUrl, + ...logDetails, + }, + "robots", + ); + return robotsParser(robotsUrl, cachedRobots); + } + + try { + logger.debug( + "Fetching robots.txt", + { url: robotsUrl, ...logDetails }, + "robots", + ); + const resp = await this._fetchRobots(robotsUrl); + if (!resp) { + return null; + } + const content = await resp.text(); + + logger.debug( + "Caching robots.txt body", + { url: robotsUrl, ...logDetails }, + "robots", + ); + await this.crawlState.setCachedRobots(robotsUrl, content); + + return robotsParser(robotsUrl, content); + } catch (e) { + // ignore + } + 
logger.warn( + "Failed to fetch robots.txt", + { + url: robotsUrl, + ...logDetails, + }, + "robots", + ); + return null; + } + async awaitPageExtraDelay(opts: WorkerState) { if (this.params.pageExtraDelay) { const { @@ -2506,6 +2601,18 @@ self.__bx_behaviors.selectMainBehavior(); return false; } + if (this.params.robots) { + const robots = await this.fetchAndParseRobots(url, logDetails); + if (robots && robots.isDisallowed(url, "Browsertrix/1.0")) { + logger.debug( + "Page URL not queued, disallowed by robots.txt", + { url, ...logDetails }, + "links", + ); + return false; + } + } + const result = await this.crawlState.addToQueue( { url, seedId, depth, extraHops, ts, pageid }, this.pageLimit, diff --git a/src/util/argParser.ts b/src/util/argParser.ts index 4e5e07c32..e84a2a5e4 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -704,6 +704,13 @@ class ArgParser { type: "array", default: [], }, + + robots: { + describe: + "If set, fetch and respect page disallows specified in per-host robots.txt", + type: "boolean", + default: false, + }, }); } diff --git a/src/util/logger.ts b/src/util/logger.ts index 7d10939ee..4842aa226 100644 --- a/src/util/logger.ts +++ b/src/util/logger.ts @@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [ "replay", "proxy", "scope", + "robots", ] as const; export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; diff --git a/src/util/state.ts b/src/util/state.ts index 8960d9b60..e933c3826 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -200,7 +200,9 @@ export class RedisCrawlState { fkey: string; ekey: string; bkey: string; + rkey: string; pageskey: string; + esKey: string; esMap: string; @@ -233,6 +235,8 @@ export class RedisCrawlState { this.ekey = this.key + ":e"; // crawler behavior script messages this.bkey = this.key + ":b"; + // cached robots.txt bodies (per-origin) + this.rkey = this.key + ":r"; // pages this.pageskey = this.key + ":pages"; @@ -1025,6 +1029,16 @@ return inx; return await this.redis.lpush(this.bkey, behaviorLog); } + async setCachedRobots(robotsUrl: string, body: string) { + const urlKey = `${this.rkey}:${robotsUrl}`; + return await this.redis.set(urlKey, body); + } + + async getCachedRobots(robotsUrl: string) { + const urlKey = `${this.rkey}:${robotsUrl}`; + return await this.redis.get(urlKey); + } + async writeToPagesQueue( data: Record, ) { From 547d645593e5a94905730b66578deefd2a44cad7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 29 Sep 2025 17:46:20 -0400 Subject: [PATCH 2/9] Implement LRU cache with limit of 100 robots.txt bodies --- src/crawler.ts | 8 +++----- src/util/constants.ts | 2 ++ src/util/state.ts | 34 +++++++++++++++++++++++++++++----- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 13d860ecc..016bc3a18 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1305,11 +1305,9 @@ self.__bx_behaviors.selectMainBehavior(); url: string, logDetails: LogDetails, ): Promise { - // Fetch robots.txt for url's host and return parser, caching robots - // bodies in Redis by their URL - // TODO: Consider using an LRU cache/only cache so many robots responses - // in Redis at one time and re-fetch if no longer in cache to avoid - // exhausting memory on very large crawls across many hosts + // Fetch robots.txt for url's host and return parser. + // Results are cached by robots.txt URL in Redis using an LRU cache + // implementation that retains the 100 most recently used values. 
const urlParser = new URL(url); const robotsUrl = `${urlParser.origin}/robots.txt`; diff --git a/src/util/constants.ts b/src/util/constants.ts index 15b00bd70..ebf83c571 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30; export const PAGE_OP_TIMEOUT_SECS = 5; export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30; +export const ROBOTS_CACHE_LIMIT = 100; + export type ExtractSelector = { selector: string; extract: string; diff --git a/src/util/state.ts b/src/util/state.ts index e933c3826..290152224 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid"; import { logger } from "./logger.js"; -import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js"; +import { + MAX_DEPTH, + DEFAULT_MAX_RETRIES, + ROBOTS_CACHE_LIMIT, +} from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { Frame } from "puppeteer-core"; import { interpolateFilename, UploadResult } from "./storage.js"; @@ -201,6 +205,7 @@ export class RedisCrawlState { ekey: string; bkey: string; rkey: string; + lkey: string; pageskey: string; esKey: string; @@ -237,6 +242,8 @@ export class RedisCrawlState { this.bkey = this.key + ":b"; // cached robots.txt bodies (per-origin) this.rkey = this.key + ":r"; + // LRU cache of robots.txt keys + this.lkey = this.key + ":l"; // pages this.pageskey = this.key + ":pages"; @@ -1029,14 +1036,31 @@ return inx; return await this.redis.lpush(this.bkey, behaviorLog); } + async _updateRobotsAccessTime(robotsUrl: string) { + const accessTime = Date.now(); + await this.redis.zadd(this.lkey, accessTime, robotsUrl); + } + async setCachedRobots(robotsUrl: string, body: string) { - const urlKey = `${this.rkey}:${robotsUrl}`; - return await this.redis.set(urlKey, body); + await this._updateRobotsAccessTime(robotsUrl); + await this.redis.set(`${this.rkey}:${robotsUrl}`, body); + + // prune least-recently used items in zset and robots cache if over limit + const cacheCount = await this.redis.zcard(this.lkey); + if (cacheCount > ROBOTS_CACHE_LIMIT) { + const diff = cacheCount - ROBOTS_CACHE_LIMIT; + const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1); + + for (const keyToDelete of keysToDelete) { + await this.redis.del(`${this.rkey}:${keyToDelete}`); + await this.redis.zrem(this.lkey, keyToDelete); + } + } } async getCachedRobots(robotsUrl: string) { - const urlKey = `${this.rkey}:${robotsUrl}`; - return await this.redis.get(urlKey); + await this._updateRobotsAccessTime(robotsUrl); + return await this.redis.get(`${this.rkey}:${robotsUrl}`); } async writeToPagesQueue( From 0a3ef304e064dcbf3743055618df1fcc2645bc1e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 30 Sep 2025 10:42:57 -0400 Subject: [PATCH 3/9] Add debug log line for deleting cached robots --- src/util/state.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/util/state.ts b/src/util/state.ts index 290152224..3df430fc1 100644 --- a/src/util/state.ts +++ b/src/util/state.ts @@ -1052,6 +1052,11 @@ return inx; const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1); for (const keyToDelete of keysToDelete) { + logger.debug( + "Deleting cached robots.txt, over cache limit", + { url: keyToDelete }, + "robots", + ); await this.redis.del(`${this.rkey}:${keyToDelete}`); await this.redis.zrem(this.lkey, keyToDelete); } From ae81f479d1c57e5e6ba54c41c75b00dbb06fa2f6 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 30 Sep 2025 11:08:00 -0400 Subject: [PATCH 4/9] Add 
tests for robots.txt being fetched and cached Does not yet include testing that a page URL disallowed by robots is not queued, as I haven't yet been able to find a Webrecorder- managed site with a robots.txt with disallows to test against. --- tests/robots_txt.test.js | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/robots_txt.test.js diff --git a/tests/robots_txt.test.js b/tests/robots_txt.test.js new file mode 100644 index 000000000..43ffe1975 --- /dev/null +++ b/tests/robots_txt.test.js @@ -0,0 +1,35 @@ +import child_process from "child_process"; + +test("test robots.txt is fetched and cached", async () => { + const res = child_process.execSync( + "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug", + ); + + const log = res.toString(); + + // robots.txt not found + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}', + ) > 0, + ).toBe(true); + + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}', + ) > 0, + ).toBe(true); + + // robots.txt found and cached + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}', + ) > 0, + ).toBe(true); + + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}', + ) > 0, + ).toBe(true); +}); From e9e373883194bc901452601c6e9834e11d4bd2e1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 25 Nov 2025 22:10:11 -0800 Subject: [PATCH 5/9] minor cleanup: - move robots logic to ./src/utils/robots.ts - add --robotsAgent arg, defaulting to Browsertrix/1.x - remove logging for 'using cached robots' as its too verbose (for every link) - cache empty "" robots responses and treat as allow all - treat non-200 and non-429 and non-503 responses as empty "" --- src/crawler.ts | 118 ++++++------------------------------------ src/util/argParser.ts | 6 +++ src/util/robots.ts | 116 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 103 deletions(-) create mode 100644 src/util/robots.ts diff --git a/src/crawler.ts b/src/crawler.ts index 016bc3a18..183166c93 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -3,8 +3,6 @@ import path from "path"; import fs, { WriteStream } from "fs"; import os from "os"; import fsp from "fs/promises"; -import { fetch as undiciFetch } from "undici"; -import robotsParser, { Robot } from "robots-parser"; import { RedisCrawlState, @@ -38,7 +36,6 @@ import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js"; import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js"; import { sleep, timedRun, secondsElapsed } from "./util/timing.js"; import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js"; -import { getProxyDispatcher } from "./util/proxy.js"; import { Browser } from "./util/browser.js"; @@ -75,6 +72,7 @@ import { import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js"; import { initProxy } from "./util/proxy.js"; import { initFlow, nextFlowStep } from "./util/flowbehavior.js"; +import { isDisallowedByRobots, setRobotsConfig } from 
"./util/robots.js"; const btrixBehaviors = fs.readFileSync( new URL( @@ -550,6 +548,10 @@ export class Crawler { this.headers = { "User-Agent": this.configureUA() }; + if (this.params.robots) { + setRobotsConfig(this.headers, this.crawlState); + } + process.on("exit", () => { for (const proc of subprocesses) { proc.kill(); @@ -1268,96 +1270,6 @@ self.__bx_behaviors.selectMainBehavior(); } } - async _fetchRobots(url: string) { - while (true) { - const resp = await undiciFetch(url, { - headers: this.headers, - dispatcher: getProxyDispatcher(url), - }); - - if (resp.ok) { - return resp; - } - - const retry = resp.headers.get("retry-after"); - - if (retry) { - logger.debug( - "Robots.txt fetch: Retry after", - { url, retrySeconds: retry }, - "robots", - ); - await sleep(parseInt(retry)); - continue; - } - - logger.debug( - "Robots.txt not fetched", - { url, status: resp.status }, - "robots", - ); - return null; - } - return null; - } - - async fetchAndParseRobots( - url: string, - logDetails: LogDetails, - ): Promise { - // Fetch robots.txt for url's host and return parser. - // Results are cached by robots.txt URL in Redis using an LRU cache - // implementation that retains the 100 most recently used values. - const urlParser = new URL(url); - const robotsUrl = `${urlParser.origin}/robots.txt`; - - const cachedRobots = await this.crawlState.getCachedRobots(robotsUrl); - if (cachedRobots) { - logger.debug( - "Using cached robots.txt body", - { - url: robotsUrl, - ...logDetails, - }, - "robots", - ); - return robotsParser(robotsUrl, cachedRobots); - } - - try { - logger.debug( - "Fetching robots.txt", - { url: robotsUrl, ...logDetails }, - "robots", - ); - const resp = await this._fetchRobots(robotsUrl); - if (!resp) { - return null; - } - const content = await resp.text(); - - logger.debug( - "Caching robots.txt body", - { url: robotsUrl, ...logDetails }, - "robots", - ); - await this.crawlState.setCachedRobots(robotsUrl, content); - - return robotsParser(robotsUrl, content); - } catch (e) { - // ignore - } - logger.warn( - "Failed to fetch robots.txt", - { - url: robotsUrl, - ...logDetails, - }, - "robots", - ); - return null; - } - async awaitPageExtraDelay(opts: WorkerState) { if (this.params.pageExtraDelay) { const { @@ -2599,16 +2511,16 @@ self.__bx_behaviors.selectMainBehavior(); return false; } - if (this.params.robots) { - const robots = await this.fetchAndParseRobots(url, logDetails); - if (robots && robots.isDisallowed(url, "Browsertrix/1.0")) { - logger.debug( - "Page URL not queued, disallowed by robots.txt", - { url, ...logDetails }, - "links", - ); - return false; - } + if ( + this.params.robots && + (await isDisallowedByRobots(url, logDetails, this.params.robotsAgent)) + ) { + logger.debug( + "Page URL not queued, disallowed by robots.txt", + { url, ...logDetails }, + "links", + ); + return false; } const result = await this.crawlState.addToQueue( diff --git a/src/util/argParser.ts b/src/util/argParser.ts index e84a2a5e4..f5d6f9733 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -711,6 +711,12 @@ class ArgParser { type: "boolean", default: false, }, + + robotsAgent: { + describe: "Agent to check in addition to '*' for robots rules", + type: "string", + default: "Browsertrix/1.x", + }, }); } diff --git a/src/util/robots.ts b/src/util/robots.ts new file mode 100644 index 000000000..f7f3abe68 --- /dev/null +++ b/src/util/robots.ts @@ -0,0 +1,116 @@ +import { fetch } from "undici"; +import robotsParser, { Robot } from "robots-parser"; + +import { LogDetails, logger 
} from "./logger.js"; +import { RedisCrawlState } from "./state.js"; +import { getProxyDispatcher } from "./proxy.js"; +import { sleep } from "./timing.js"; + +let headers: Record = {}; +let crawlState: RedisCrawlState | null = null; + +export function setRobotsConfig( + _headers: Record, + state: RedisCrawlState, +) { + headers = _headers; + crawlState = state; +} + +export async function isDisallowedByRobots( + url: string, + logDetails: LogDetails, + robotsAgent: string, +) { + const robots = await fetchAndParseRobots(url, logDetails); + return robots && robots.isDisallowed(url, robotsAgent); +} + +async function fetchAndParseRobots( + url: string, + logDetails: LogDetails, +): Promise { + // Fetch robots.txt for url's host and return parser. + // Results are cached by robots.txt URL in Redis using an LRU cache + // implementation that retains the 100 most recently used values. + const urlParser = new URL(url); + const robotsUrl = `${urlParser.origin}/robots.txt`; + + const cachedRobots = await crawlState!.getCachedRobots(robotsUrl); + // empty string is valid cached empty robots, so check for null + if (cachedRobots !== null) { + // don't create parser, just skip check if empty string + return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null; + } + + try { + logger.debug( + "Fetching robots.txt", + { url: robotsUrl, ...logDetails }, + "robots", + ); + const content = await fetchRobots(robotsUrl); + if (content === null) { + return null; + } + + logger.debug( + "Caching robots.txt body", + { url: robotsUrl, ...logDetails }, + "robots", + ); + await crawlState!.setCachedRobots(robotsUrl, content); + + // empty string cached, but no need to create parser + return content ? robotsParser(robotsUrl, content) : null; + } catch (e) { + // ignore + } + logger.warn( + "Failed to fetch robots.txt", + { + url: robotsUrl, + ...logDetails, + }, + "robots", + ); + return null; +} + +async function fetchRobots(url: string): Promise { + while (true) { + const resp = await fetch(url, { + headers, + dispatcher: getProxyDispatcher(url), + }); + + if (resp.ok) { + return await resp.text(); + } + + if (resp.status === 429 || resp.status === 503) { + const retry = resp.headers.get("retry-after"); + + if (retry) { + logger.debug( + "Robots.txt fetch: Retry after", + { url, retrySeconds: retry }, + "robots", + ); + await sleep(parseInt(retry)); + continue; + } + + logger.debug( + "Robots.txt not fetched, will retry later", + { url, status: resp.status }, + "robots", + ); + + return null; + } + + // for other status errors, just return empty + return ""; + } +} From 787a1d2aa1a500d213798a4f8542427036326e99 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 25 Nov 2025 22:52:15 -0800 Subject: [PATCH 6/9] log when invalid code is stored as empty, fix test --- src/util/robots.ts | 8 +++++++- tests/robots_txt.test.js | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/util/robots.ts b/src/util/robots.ts index f7f3abe68..93d9c152c 100644 --- a/src/util/robots.ts +++ b/src/util/robots.ts @@ -102,7 +102,7 @@ async function fetchRobots(url: string): Promise { } logger.debug( - "Robots.txt not fetched, will retry later", + "Robots.txt temporarily not fetched, will retry later", { url, status: resp.status }, "robots", ); @@ -110,6 +110,12 @@ async function fetchRobots(url: string): Promise { return null; } + logger.debug( + "Robots.txt invalid, storing empty value", + { url, status: resp.status }, + "robots", + ); + // for other status errors, just return empty return ""; } diff --git 
a/tests/robots_txt.test.js b/tests/robots_txt.test.js index 43ffe1975..a181d016f 100644 --- a/tests/robots_txt.test.js +++ b/tests/robots_txt.test.js @@ -16,7 +16,7 @@ test("test robots.txt is fetched and cached", async () => { expect( log.indexOf( - '"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}', + '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}', ) > 0, ).toBe(true); From f0ace6dd390c0d229c21f37e3a4e539c9d33962f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Nov 2025 17:00:14 -0800 Subject: [PATCH 7/9] remove robots fetch retry: - for now, limit to 10 seconds to fetch robots, if failed, treat same as no robots --- src/util/robots.ts | 66 ++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/src/util/robots.ts b/src/util/robots.ts index 93d9c152c..b2076632f 100644 --- a/src/util/robots.ts +++ b/src/util/robots.ts @@ -4,11 +4,14 @@ import robotsParser, { Robot } from "robots-parser"; import { LogDetails, logger } from "./logger.js"; import { RedisCrawlState } from "./state.js"; import { getProxyDispatcher } from "./proxy.js"; -import { sleep } from "./timing.js"; +import { timedRun } from "./timing.js"; let headers: Record = {}; let crawlState: RedisCrawlState | null = null; +// max seconds to wait to fetch robots +const ROBOTS_FETCH_TIMEOUT = 10; + export function setRobotsConfig( _headers: Record, state: RedisCrawlState, @@ -49,7 +52,14 @@ async function fetchAndParseRobots( { url: robotsUrl, ...logDetails }, "robots", ); - const content = await fetchRobots(robotsUrl); + const content = await timedRun( + fetchRobots(robotsUrl), + ROBOTS_FETCH_TIMEOUT, + "Fetching Robots timed out", + logDetails, + "robots", + ); + if (content === null) { return null; } @@ -78,45 +88,21 @@ async function fetchAndParseRobots( } async function fetchRobots(url: string): Promise { - while (true) { - const resp = await fetch(url, { - headers, - dispatcher: getProxyDispatcher(url), - }); - - if (resp.ok) { - return await resp.text(); - } - - if (resp.status === 429 || resp.status === 503) { - const retry = resp.headers.get("retry-after"); - - if (retry) { - logger.debug( - "Robots.txt fetch: Retry after", - { url, retrySeconds: retry }, - "robots", - ); - await sleep(parseInt(retry)); - continue; - } - - logger.debug( - "Robots.txt temporarily not fetched, will retry later", - { url, status: resp.status }, - "robots", - ); + const resp = await fetch(url, { + headers, + dispatcher: getProxyDispatcher(url), + }); - return null; - } + if (resp.ok) { + return await resp.text(); + } - logger.debug( - "Robots.txt invalid, storing empty value", - { url, status: resp.status }, - "robots", - ); + logger.debug( + "Robots.txt invalid, storing empty value", + { url, status: resp.status }, + "robots", + ); - // for other status errors, just return empty - return ""; - } + // for other status errors, just return empty + return ""; } From 0b9feddb27033477595a9950451654158f5b6613 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Nov 2025 17:50:52 -0800 Subject: [PATCH 8/9] batch multiple fetches to same URL locally limit size of robots.txt to 100K --- src/util/robots.ts | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/util/robots.ts b/src/util/robots.ts index 
b2076632f..a6db043d3 100644 --- a/src/util/robots.ts +++ b/src/util/robots.ts @@ -9,6 +9,11 @@ import { timedRun } from "./timing.js"; let headers: Record = {}; let crawlState: RedisCrawlState | null = null; +const pendingFetches: Map> = new Map< + string, + Promise +>(); + // max seconds to wait to fetch robots const ROBOTS_FETCH_TIMEOUT = 10; @@ -47,18 +52,20 @@ async function fetchAndParseRobots( } try { - logger.debug( - "Fetching robots.txt", - { url: robotsUrl, ...logDetails }, - "robots", - ); - const content = await timedRun( - fetchRobots(robotsUrl), - ROBOTS_FETCH_TIMEOUT, - "Fetching Robots timed out", - logDetails, - "robots", - ); + let promise = pendingFetches.get(robotsUrl); + + if (!promise) { + promise = timedRun( + fetchRobots(robotsUrl, logDetails), + ROBOTS_FETCH_TIMEOUT, + "Fetching Robots timed out", + logDetails, + "robots", + ); + pendingFetches.set(robotsUrl, promise); + } + + const content = await promise; if (content === null) { return null; @@ -75,6 +82,8 @@ async function fetchAndParseRobots( return content ? robotsParser(robotsUrl, content) : null; } catch (e) { // ignore + } finally { + pendingFetches.delete(robotsUrl); } logger.warn( "Failed to fetch robots.txt", @@ -87,14 +96,21 @@ async function fetchAndParseRobots( return null; } -async function fetchRobots(url: string): Promise { +async function fetchRobots( + url: string, + logDetails: LogDetails, +): Promise { + logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots"); + const resp = await fetch(url, { headers, dispatcher: getProxyDispatcher(url), }); if (resp.ok) { - return await resp.text(); + const buff = await resp.arrayBuffer(); + // only decode and store at most 100K + return new TextDecoder().decode(buff.slice(0, 100000)); } logger.debug( From b54d1d1bba017095d78601e2358a86fd8fe42bbc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 26 Nov 2025 18:57:47 -0800 Subject: [PATCH 9/9] docs: update cli options for robots --- docs/docs/user-guide/cli-options.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/docs/user-guide/cli-options.md b/docs/docs/user-guide/cli-options.md index 9a438c04b..d819318e8 100644 --- a/docs/docs/user-guide/cli-options.md +++ b/docs/docs/user-guide/cli-options.md @@ -103,16 +103,16 @@ Options: , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt - atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"] - [default: []] + atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope", + "robots"] [default: []] --logExcludeContext Comma-separated list of contexts to NOT include in logs [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer" , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt - atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"] - [default: ["recorderNetwork","jsError","screencast"]] + atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope", + "robots"] [default: ["recorderNetwork","jsError","screencast"]] --text Extract initial (default) or final t ext to pages.jsonl or WARC resource record(s) @@ -324,6 +324,12 @@ Options: the Chrome 
instance (space-separated or multiple --extraChromeArgs) [array] [default: []] + --robots If set, fetch and respect page disal + lows specified in per-host robots.tx + t [boolean] [default: false] + --robotsAgent Agent to check in addition to '*' fo + r robots rules + [string] [default: "Browsertrix/1.x"] --config Path to YAML config file ```
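
Example: checking a URL against a robots.txt body. This is a minimal sketch of the `robots-parser` calls used in these patches (`robotsParser(url, body)` and `isDisallowed(url, agent)`); the example URL and rules are illustrative only, and the agent string is the `--robotsAgent` default documented above.

```ts
import robotsParser from "robots-parser";

// Illustrative robots.txt body and URL (not from a real crawl)
const robotsUrl = "https://example.com/robots.txt";
const body = "User-agent: *\nDisallow: /private/\n";

const robots = robotsParser(robotsUrl, body);

// robots-parser applies the most specific matching user-agent group and
// falls back to the "*" group when the named agent has no rules of its own.
console.log(robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.x")); // true
console.log(robots.isDisallowed("https://example.com/", "Browsertrix/1.x")); // false
```

In a crawl, the same check is enabled with the new flags, e.g. `--robots` (optionally with `--robotsAgent`), as exercised by `tests/robots_txt.test.js` above.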