
Commit 1d15a15

tw4l and ikreymer authored
Add option to respect robots.txt disallows (#888)
Fixes #631

- Adds a --robots flag which enables checking robots.txt for each host, for each page, before the page is queued for further crawling.
- Supports a --robotsAgent flag which configures the agent to check in robots.txt, in addition to '*'. Defaults to 'Browsertrix/1.x'.
- Robots.txt bodies are parsed and checked for page allow/disallow status using the https://github.com/samclarke/robots-parser library, the most active and well-maintained implementation I could find with TypeScript types.
- Fetched robots.txt bodies are cached by their URL in Redis using an LRU, retaining the last 100 robots.txt entries, each up to 100K.
- Non-200 responses are treated as empty robots.txt, and empty robots.txt is treated as 'allow all'.
- Multiple requests to the same robots.txt are batched to perform only one fetch, waiting up to 10 seconds per fetch.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent 75a0c9a commit 1d15a15
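
The page check boils down to the robots-parser calls in src/util/robots.ts below: build a parser from the robots.txt URL and body, then ask isDisallowed() for each candidate page URL with the configured agent. A minimal sketch of that call pattern (the example.com robots.txt body is made up for illustration):

```ts
import robotsParser from "robots-parser";

// Hypothetical robots.txt body, for illustration only
const body = "User-agent: *\nDisallow: /private/\n";

const robots = robotsParser("https://example.com/robots.txt", body);

// Only a '*' group exists here, so any agent falls back to it
console.log(robots.isDisallowed("https://example.com/private/page", "Browsertrix/1.x")); // true
console.log(robots.isDisallowed("https://example.com/index.html", "Browsertrix/1.x")); // false
```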

File tree

9 files changed: +247, -5 lines changed


docs/docs/user-guide/cli-options.md

Lines changed: 10 additions & 4 deletions
@@ -103,16 +103,16 @@ Options:
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-[default: []]
+atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
+"robots"] [default: []]
 --logExcludeContext Comma-separated list of contexts to
 NOT include in logs
 [array] [choices: "general", "worker", "recorder", "recorderNetwork", "writer"
 , "state", "redis", "storage", "text", "exclusion", "screenshots", "screencast
 ", "originOverride", "healthcheck", "browser", "blocking", "behavior", "behavi
 orScript", "behaviorScriptCustom", "jsError", "fetch", "pageStatus", "memorySt
-atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope"]
-[default: ["recorderNetwork","jsError","screencast"]]
+atus", "crawlStatus", "links", "sitemap", "wacz", "replay", "proxy", "scope",
+"robots"] [default: ["recorderNetwork","jsError","screencast"]]
 --text Extract initial (default) or final t
 ext to pages.jsonl or WARC resource
 record(s)
@@ -324,6 +324,12 @@ Options:
 the Chrome instance (space-separated
 or multiple --extraChromeArgs)
 [array] [default: []]
+--robots If set, fetch and respect page disal
+lows specified in per-host robots.tx
+t [boolean] [default: false]
+--robotsAgent Agent to check in addition to '*' fo
+r robots rules
+[string] [default: "Browsertrix/1.x"]
 --config Path to YAML config file
 ```

package.json

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@
   "pixelmatch": "^5.3.0",
   "pngjs": "^7.0.0",
   "puppeteer-core": "^24.30.0",
+  "robots-parser": "^3.0.1",
   "sax": "^1.3.0",
   "sharp": "^0.32.6",
   "tsc": "^2.0.4",

src/crawler.ts

Lines changed: 17 additions & 0 deletions
@@ -72,6 +72,7 @@ import {
 import { isHTMLMime, isRedirectStatus } from "./util/reqresp.js";
 import { initProxy } from "./util/proxy.js";
 import { initFlow, nextFlowStep } from "./util/flowbehavior.js";
+import { isDisallowedByRobots, setRobotsConfig } from "./util/robots.js";

 const btrixBehaviors = fs.readFileSync(
   new URL(
@@ -547,6 +548,10 @@ export class Crawler {

     this.headers = { "User-Agent": this.configureUA() };

+    if (this.params.robots) {
+      setRobotsConfig(this.headers, this.crawlState);
+    }
+
     process.on("exit", () => {
       for (const proc of subprocesses) {
         proc.kill();
@@ -2506,6 +2511,18 @@ self.__bx_behaviors.selectMainBehavior();
       return false;
     }

+    if (
+      this.params.robots &&
+      (await isDisallowedByRobots(url, logDetails, this.params.robotsAgent))
+    ) {
+      logger.debug(
+        "Page URL not queued, disallowed by robots.txt",
+        { url, ...logDetails },
+        "links",
+      );
+      return false;
+    }
+
     const result = await this.crawlState.addToQueue(
       { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,

src/util/argParser.ts

Lines changed: 13 additions & 0 deletions
@@ -704,6 +704,19 @@ class ArgParser {
       type: "array",
       default: [],
     },
+
+    robots: {
+      describe:
+        "If set, fetch and respect page disallows specified in per-host robots.txt",
+      type: "boolean",
+      default: false,
+    },
+
+    robotsAgent: {
+      describe: "Agent to check in addition to '*' for robots rules",
+      type: "string",
+      default: "Browsertrix/1.x",
+    },
   });
 }

src/util/constants.ts

Lines changed: 2 additions & 0 deletions
@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;

+export const ROBOTS_CACHE_LIMIT = 100;
+
 export type ExtractSelector = {
   selector: string;
   extract: string;

src/util/logger.ts

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "robots",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];

src/util/robots.ts

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+import { fetch } from "undici";
+import robotsParser, { Robot } from "robots-parser";
+
+import { LogDetails, logger } from "./logger.js";
+import { RedisCrawlState } from "./state.js";
+import { getProxyDispatcher } from "./proxy.js";
+import { timedRun } from "./timing.js";
+
+let headers: Record<string, string> = {};
+let crawlState: RedisCrawlState | null = null;
+
+const pendingFetches: Map<string, Promise<string>> = new Map<
+  string,
+  Promise<string>
+>();
+
+// max seconds to wait to fetch robots
+const ROBOTS_FETCH_TIMEOUT = 10;
+
+export function setRobotsConfig(
+  _headers: Record<string, string>,
+  state: RedisCrawlState,
+) {
+  headers = _headers;
+  crawlState = state;
+}
+
+export async function isDisallowedByRobots(
+  url: string,
+  logDetails: LogDetails,
+  robotsAgent: string,
+) {
+  const robots = await fetchAndParseRobots(url, logDetails);
+  return robots && robots.isDisallowed(url, robotsAgent);
+}
+
+async function fetchAndParseRobots(
+  url: string,
+  logDetails: LogDetails,
+): Promise<Robot | null> {
+  // Fetch robots.txt for url's host and return parser.
+  // Results are cached by robots.txt URL in Redis using an LRU cache
+  // implementation that retains the 100 most recently used values.
+  const urlParser = new URL(url);
+  const robotsUrl = `${urlParser.origin}/robots.txt`;
+
+  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
+  // empty string is valid cached empty robots, so check for null
+  if (cachedRobots !== null) {
+    // don't create parser, just skip check if empty string
+    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
+  }
+
+  try {
+    let promise = pendingFetches.get(robotsUrl);
+
+    if (!promise) {
+      promise = timedRun(
+        fetchRobots(robotsUrl, logDetails),
+        ROBOTS_FETCH_TIMEOUT,
+        "Fetching Robots timed out",
+        logDetails,
+        "robots",
+      );
+      pendingFetches.set(robotsUrl, promise);
+    }
+
+    const content = await promise;
+
+    if (content === null) {
+      return null;
+    }
+
+    logger.debug(
+      "Caching robots.txt body",
+      { url: robotsUrl, ...logDetails },
+      "robots",
+    );
+    await crawlState!.setCachedRobots(robotsUrl, content);
+
+    // empty string cached, but no need to create parser
+    return content ? robotsParser(robotsUrl, content) : null;
+  } catch (e) {
+    // ignore
+  } finally {
+    pendingFetches.delete(robotsUrl);
+  }
+  logger.warn(
+    "Failed to fetch robots.txt",
+    {
+      url: robotsUrl,
+      ...logDetails,
+    },
+    "robots",
+  );
+  return null;
+}
+
+async function fetchRobots(
+  url: string,
+  logDetails: LogDetails,
+): Promise<string | null> {
+  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");
+
+  const resp = await fetch(url, {
+    headers,
+    dispatcher: getProxyDispatcher(url),
+  });
+
+  if (resp.ok) {
+    const buff = await resp.arrayBuffer();
+    // only decode and store at most 100K
+    return new TextDecoder().decode(buff.slice(0, 100000));
+  }
+
+  logger.debug(
+    "Robots.txt invalid, storing empty value",
+    { url, status: resp.status },
+    "robots",
+  );
+
+  // for other status errors, just return empty
+  return "";
+}
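
The pendingFetches map above is what the commit message calls batching: the first caller for a given robots.txt URL starts the time-limited fetch and stores its promise, concurrent callers await that same promise, and the entry is cleared once it settles. A stripped-down sketch of that single-flight pattern, assuming a global fetch and leaving out the timeout, Redis caching, and logging:

```ts
// Hypothetical standalone version of the single-flight pattern used above
const inFlight = new Map<string, Promise<string>>();

async function fetchRobotsOnce(robotsUrl: string): Promise<string> {
  let promise = inFlight.get(robotsUrl);
  if (!promise) {
    // only the first concurrent caller issues the request;
    // non-200 responses become an empty body, i.e. "allow all"
    promise = fetch(robotsUrl).then((resp) => (resp.ok ? resp.text() : ""));
    inFlight.set(robotsUrl, promise);
  }
  try {
    return await promise;
  } finally {
    // clear after settling so a later attempt can refetch
    inFlight.delete(robotsUrl);
  }
}
```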

src/util/state.ts

Lines changed: 44 additions & 1 deletion
@@ -3,7 +3,11 @@ import { v4 as uuidv4 } from "uuid";

 import { logger } from "./logger.js";

-import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
+import {
+  MAX_DEPTH,
+  DEFAULT_MAX_RETRIES,
+  ROBOTS_CACHE_LIMIT,
+} from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
   fkey: string;
   ekey: string;
   bkey: string;
+  rkey: string;
+  lkey: string;
   pageskey: string;
+
   esKey: string;
   esMap: string;

@@ -233,6 +240,10 @@ export class RedisCrawlState {
     this.ekey = this.key + ":e";
     // crawler behavior script messages
     this.bkey = this.key + ":b";
+    // cached robots.txt bodies (per-origin)
+    this.rkey = this.key + ":r";
+    // LRU cache of robots.txt keys
+    this.lkey = this.key + ":l";
     // pages
     this.pageskey = this.key + ":pages";

@@ -1025,6 +1036,38 @@ return inx;
     return await this.redis.lpush(this.bkey, behaviorLog);
   }

+  async _updateRobotsAccessTime(robotsUrl: string) {
+    const accessTime = Date.now();
+    await this.redis.zadd(this.lkey, accessTime, robotsUrl);
+  }
+
+  async setCachedRobots(robotsUrl: string, body: string) {
+    await this._updateRobotsAccessTime(robotsUrl);
+    await this.redis.set(`${this.rkey}:${robotsUrl}`, body);
+
+    // prune least-recently used items in zset and robots cache if over limit
+    const cacheCount = await this.redis.zcard(this.lkey);
+    if (cacheCount > ROBOTS_CACHE_LIMIT) {
+      const diff = cacheCount - ROBOTS_CACHE_LIMIT;
+      const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1);
+
+      for (const keyToDelete of keysToDelete) {
+        logger.debug(
+          "Deleting cached robots.txt, over cache limit",
+          { url: keyToDelete },
+          "robots",
+        );
+        await this.redis.del(`${this.rkey}:${keyToDelete}`);
+        await this.redis.zrem(this.lkey, keyToDelete);
+      }
+    }
+  }
+
+  async getCachedRobots(robotsUrl: string) {
+    await this._updateRobotsAccessTime(robotsUrl);
+    return await this.redis.get(`${this.rkey}:${robotsUrl}`);
+  }
+
   async writeToPagesQueue(
     data: Record<string, string | number | boolean | object>,
   ) {
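
The cache bookkeeping above pairs plain string keys (`<crawl>:r:<robots url>`) holding robots.txt bodies with a sorted set (`<crawl>:l`) scored by last access time, so eviction is just removing the lowest-scored members once the set grows past ROBOTS_CACHE_LIMIT. A self-contained sketch of the same pattern, assuming an ioredis client like the one RedisCrawlState wraps (the key names and limit of 3 are illustrative):

```ts
import Redis from "ioredis";

const redis = new Redis("redis://localhost:6379/0"); // illustrative connection URL
const LIMIT = 3;

async function cacheWithLru(url: string, body: string) {
  await redis.zadd("demo:l", Date.now(), url); // score = last access time
  await redis.set(`demo:r:${url}`, body);

  const count = await redis.zcard("demo:l");
  if (count > LIMIT) {
    // lowest scores are the least recently used entries
    const evict = await redis.zrange("demo:l", 0, count - LIMIT - 1);
    for (const old of evict) {
      await redis.del(`demo:r:${old}`);
      await redis.zrem("demo:l", old);
    }
  }
}
```

Reading back mirrors getCachedRobots(): bump the member's score with zadd, then GET the body key.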

tests/robots_txt.test.js

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+import child_process from "child_process";
+
+test("test robots.txt is fetched and cached", async () => {
+  const res = child_process.execSync(
+    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
+  );
+
+  const log = res.toString();
+
+  // robots.txt not found
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
+    ) > 0,
+  ).toBe(true);
+
+  // robots.txt found and cached
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
+    ) > 0,
+  ).toBe(true);
+});
