14 changes: 10 additions & 4 deletions docs/docs/user-guide/cli-options.md
@@ -103,16 +103,16 @@ Options:
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
[default: []]
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: []]
--logExcludeContext Comma-separated list of contexts to
NOT include in logs
[array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
, "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
[default: ["recorderNetwork","jsError","screencast"]]
atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
"robots"] [default: ["recorderNetwork","jsError","screencast"]]
--text Extract initial (default) or final t
ext to pages.jsonl or WARC resource
record(s)
@@ -324,6 +324,12 @@ Options:
the Chrome instance (space-separated
or multiple --extraChromeArgs)
[array] [default: []]
--robots If set, fetch and respect page disal
lows specified in per-host robots.tx
t [boolean] [default: false]
--robotsAgent Agent to check in addition to '*' fo
r robots rules
[string] [default: "Browsertrix/1.x"]
--config Path to YAML config file
```
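
For example (mirroring the docker invocation used in the new `tests/robots_txt.test.js` below; the image name and volume mount are illustrative), a crawl that honors per-host robots.txt rules could be started with:

```sh
docker run -v $PWD/crawls:/crawls webrecorder/browsertrix-crawler crawl \
  --url https://webrecorder.net/ --scopeType page \
  --robots --robotsAgent "Browsertrix/1.x" --logging debug
```

Pages disallowed by the matching robots.txt rules are skipped before they are queued (see the `crawler.ts` change below).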

1 change: 1 addition & 0 deletions package.json
@@ -34,6 +34,7 @@
"pixelmatch": "^5.3.0",
"pngjs": "^7.0.0",
"puppeteer-core": "^24.30.0",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"sharp": "^0.32.6",
"tsc": "^2.0.4",
17 changes: 17 additions & 0 deletions src/crawler.ts
@@ -72,6 +72,7 @@ import {
import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
import { initProxy } from "./util/proxy.js";
import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";

const btrixBehaviors = fs.readFileSync(
new URL(
@@ -547,6 +548,10 @@ export class Crawler {

this.headers = { "User-Agent": this.configureUA() };

if (this.params.robots) {
setRobotsConfig(this.headers, this.crawlState);
}

process.on("exit", () => {
for (const proc of subprocesses) {
proc.kill();
@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
return false;
}

if (
this.params.robots &&
(await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
) {
logger.debug(
"Page URL not queued, disallowed by robots.txt",
{ url, ...logDetails },
"links",
);
return false;
}

const result = await this.crawlState.addToQueue(
{ url, seedId, depth, extraHops, ts, pageid },
this.pageLimit,
13 changes: 13 additions & 0 deletions src/util/argParser.ts
@@ -704,6 +704,19 @@ class ArgParser {
type: "array",
default: [],
},

robots: {
describe:
"If set, fetch and respect page disallows specified in per-host robots.txt",
type: "boolean",
default: false,
},

robotsAgent: {
describe: "Agent to check in addition to '*' for robots rules",
type: "string",
default: "Browsertrix/1.x",
},
});
}

2 changes: 2 additions & 0 deletions src/util/constants.ts
@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
export const PAGE_OP_TIMEOUT_SECS = 5;
export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

export const ROBOTS_CACHE_LIMIT = 100;

export type ExtractSelector = {
selector: string;
extract: string;
1 change: 1 addition & 0 deletions src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
"replay",
"proxy",
"scope",
"robots",
] as const;

export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
124 changes: 124 additions & 0 deletions src/util/robots.ts
@@ -0,0 +1,124 @@
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";

let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;

const pendingFetches: Map<string, Promise<string>> = new Map<
string,
Promise<string>
>();

// max seconds to wait when fetching robots.txt
const ROBOTS_FETCH_TIMEOUT = 10;

export function setRobotsConfig(
_headers: Record<string, string>,
state: RedisCrawlState,
) {
headers = _headers;
crawlState = state;
}

export async function isDisallowedByRobots(
url: string,
logDetails: LogDetails,
robotsAgent: string,
) {
const robots = await fetchAndParseRobots(url, logDetails);
return robots && robots.isDisallowed(url, robotsAgent);
}

async function fetchAndParseRobots(
url: string,
logDetails: LogDetails,
): Promise<Robot | null> {
// Fetch robots.txt for the URL's origin and return a parser.
// Results are cached by robots.txt URL in Redis using an LRU cache
// that retains the ROBOTS_CACHE_LIMIT (100) most recently used entries.
const urlParser = new URL(url);
const robotsUrl = `${urlParser.origin}/robots.txt`;

const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
// an empty string is a valid cached (empty) robots.txt, so check explicitly for null
if (cachedRobots !== null) {
// don't create parser, just skip check if empty string
return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
}

try {
let promise = pendingFetches.get(robotsUrl);

if (!promise) {
promise = timedRun(
fetchRobots(robotsUrl, logDetails),
ROBOTS_FETCH_TIMEOUT,
"Fetching Robots timed out",
logDetails,
"robots",
);
pendingFetches.set(robotsUrl, promise);
}

const content = await promise;

if (content === null) {
return null;
}

logger.debug(
"Caching robots.txt body",
{ url: robotsUrl, ...logDetails },
"robots",
);
await crawlState!.setCachedRobots(robotsUrl, content);

// empty string cached, but no need to create parser
return content ? robotsParser(robotsUrl, content) : null;
} catch (e) {
// ignore
} finally {
pendingFetches.delete(robotsUrl);
}
logger.warn(
"Failed to fetch robots.txt",
{
url: robotsUrl,
...logDetails,
},
"robots",
);
return null;
}

async function fetchRobots(
url: string,
logDetails: LogDetails,
): Promise<string | null> {
logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");

const resp = await fetch(url, {
headers,
dispatcher: getProxyDispatcher(url),
});

if (resp.ok) {
const buff = await resp.arrayBuffer();
// only decode and store at most 100K
return new TextDecoder().decode(buff.slice(0, 100000));
}

logger.debug(
"Robots.txt invalid, storing empty value",
{ url, status: resp.status },
"robots",
);

// for non-OK statuses, return an empty string (the caller caches it)
return "";
}
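
For reference, here is a minimal standalone sketch of how the `robots-parser` dependency (added in `package.json` above) evaluates these rules. The robots.txt body below is made up for illustration; in the crawler the real body is fetched and cached as shown in `fetchAndParseRobots`:

```ts
import robotsParser from "robots-parser";

// Hypothetical robots.txt body, for illustration only
const robotsUrl = "https://example.com/robots.txt";
const body = "User-agent: *\nDisallow: /private/\n";

const robots = robotsParser(robotsUrl, body);

// Matches the Disallow rule under '*', so the page would be skipped
console.log(robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.x")); // true

// No matching rule, so the page may be queued
console.log(robots.isDisallowed("https://example.com/docs", "Browsertrix/1.x")); // false
```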
45 changes: 44 additions & 1 deletion src/util/state.ts
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";

import { logger } from "./logger.js";

import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
import {
MAX_DEPTH,
DEFAULT_MAX_RETRIES,
ROBOTS_CACHE_LIMIT,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { Frame } from "puppeteer-core";
import { interpolateFilename, UploadResult } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
fkey: string;
ekey: string;
bkey: string;
rkey: string;
lkey: string;
pageskey: string;

esKey: string;
esMap: string;

@@ -233,6 +240,10 @@ export class RedisCrawlState {
this.ekey = this.key + ":e";
// crawler behavior script messages
this.bkey = this.key + ":b";
// cached robots.txt bodies (per-origin)
this.rkey = this.key + ":r";
// LRU cache of robots.txt keys
this.lkey = this.key + ":l";
// pages
this.pageskey = this.key + ":pages";

@@ -1025,6 +1036,38 @@ return inx;
return await this.redis.lpush(this.bkey, behaviorLog);
}

async _updateRobotsAccessTime(robotsUrl: string) {
const accessTime = Date.now();
await this.redis.zadd(this.lkey, accessTime, robotsUrl);
}

async setCachedRobots(robotsUrl: string, body: string) {
await this._updateRobotsAccessTime(robotsUrl);
await this.redis.set(`${this.rkey}:${robotsUrl}`, body);

// prune least-recently used items in zset and robots cache if over limit
const cacheCount = await this.redis.zcard(this.lkey);
if (cacheCount > ROBOTS_CACHE_LIMIT) {
const diff = cacheCount - ROBOTS_CACHE_LIMIT;
const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);

for (const keyToDelete of keysToDelete) {
logger.debug(
"Deleting cached robots.txt, over cache limit",
{ url: keyToDelete },
"robots",
);
await this.redis.del(`${this.rkey}:${keyToDelete}`);
await this.redis.zrem(this.lkey, keyToDelete);
}
}
}

async getCachedRobots(robotsUrl: string) {
await this._updateRobotsAccessTime(robotsUrl);
return await this.redis.get(`${this.rkey}:${robotsUrl}`);
}

async writeToPagesQueue(
data: Record<string, string | number | boolean | object>,
) {
35 changes: 35 additions & 0 deletions tests/robots_txt.test.js
@@ -0,0 +1,35 @@
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
const res = child_process.execSync(
"docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
);

const log = res.toString();

// robots.txt not found
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);

expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
) > 0,
).toBe(true);

// robots.txt found and cached
expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);

expect(
log.indexOf(
'"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
) > 0,
).toBe(true);
});