Add option to respect robots.txt disallows #888
Merged
Changes from all commits (9 commits):
a6bc35d  Add CLI option to respect robots.txt disallows (tw4l)
547d645  Implement LRU cache with limit of 100 robots.txt bodies (tw4l)
0a3ef30  Add debug log line for deleting cached robots (tw4l)
ae81f47  Add tests for robots.txt being fetched and cached (tw4l)
e9e3738  minor cleanup: (ikreymer)
787a1d2  log when invalid code is stored as empty, fix test (ikreymer)
f0ace6d  remove robots fetch retry: (ikreymer)
0b9fedd  batch multiple fetches to same URL locally (ikreymer)
b54d1d1  docs: update cli options for robots (ikreymer)
New file added by this PR (diff `@@ -0,0 +1,124 @@`), the robots.txt fetching and caching module:

```ts
import { fetch } from "undici";
import robotsParser, { Robot } from "robots-parser";

import { LogDetails, logger } from "./logger.js";
import { RedisCrawlState } from "./state.js";
import { getProxyDispatcher } from "./proxy.js";
import { timedRun } from "./timing.js";

let headers: Record<string, string> = {};
let crawlState: RedisCrawlState | null = null;

const pendingFetches: Map<string, Promise<string>> = new Map<
  string,
  Promise<string>
>();

// max seconds to wait to fetch robots
const ROBOTS_FETCH_TIMEOUT = 10;

export function setRobotsConfig(
  _headers: Record<string, string>,
  state: RedisCrawlState,
) {
  headers = _headers;
  crawlState = state;
}

export async function isDisallowedByRobots(
  url: string,
  logDetails: LogDetails,
  robotsAgent: string,
) {
  const robots = await fetchAndParseRobots(url, logDetails);
  return robots && robots.isDisallowed(url, robotsAgent);
}

async function fetchAndParseRobots(
  url: string,
  logDetails: LogDetails,
): Promise<Robot | null> {
  // Fetch robots.txt for url's host and return parser.
  // Results are cached by robots.txt URL in Redis using an LRU cache
  // implementation that retains the 100 most recently used values.
  const urlParser = new URL(url);
  const robotsUrl = `${urlParser.origin}/robots.txt`;

  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
  // empty string is valid cached empty robots, so check for null
  if (cachedRobots !== null) {
    // don't create parser, just skip check if empty string
    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
  }

  try {
    let promise = pendingFetches.get(robotsUrl);

    if (!promise) {
      promise = timedRun(
        fetchRobots(robotsUrl, logDetails),
        ROBOTS_FETCH_TIMEOUT,
        "Fetching Robots timed out",
        logDetails,
        "robots",
      );
      pendingFetches.set(robotsUrl, promise);
    }

    const content = await promise;

    if (content === null) {
      return null;
    }

    logger.debug(
      "Caching robots.txt body",
      { url: robotsUrl, ...logDetails },
      "robots",
    );
    await crawlState!.setCachedRobots(robotsUrl, content);

    // empty string cached, but no need to create parser
    return content ? robotsParser(robotsUrl, content) : null;
  } catch (e) {
    // ignore
  } finally {
    pendingFetches.delete(robotsUrl);
  }

  logger.warn(
    "Failed to fetch robots.txt",
    {
      url: robotsUrl,
      ...logDetails,
    },
    "robots",
  );
  return null;
}

async function fetchRobots(
  url: string,
  logDetails: LogDetails,
): Promise<string | null> {
  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");

  const resp = await fetch(url, {
    headers,
    dispatcher: getProxyDispatcher(url),
  });

  if (resp.ok) {
    const buff = await resp.arrayBuffer();
    // only decode and store at most 100K
    return new TextDecoder().decode(buff.slice(0, 100000));
  }

  logger.debug(
    "Robots.txt invalid, storing empty value",
    { url, status: resp.status },
    "robots",
  );

  // for other status errors, just return empty
  return "";
}
```
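The module above delegates caching to `crawlState.getCachedRobots()` and `crawlState.setCachedRobots()`, which according to the commit history keep an LRU cache of at most 100 robots.txt bodies in Redis; that state code is not part of this diff. Below is a minimal sketch of one way such an LRU could be laid out in Redis, assuming an ioredis client and hypothetical key names. It is an illustration only, not the PR's actual `state.ts` implementation.

```ts
import { Redis } from "ioredis";

// Hypothetical sketch: robots.txt bodies live in a Redis hash, while a sorted
// set scored by last-access time tracks recency so the cache can be trimmed
// to the 100 most recently used entries (the limit named in the commits).
const MAX_CACHED_ROBOTS = 100;

export class RobotsLRUCache {
  constructor(
    private redis: Redis,
    private keyPrefix = "robots", // assumed key prefix, not from the PR
  ) {}

  async getCachedRobots(robotsUrl: string): Promise<string | null> {
    const body = await this.redis.hget(`${this.keyPrefix}:bodies`, robotsUrl);
    if (body !== null) {
      // cache hit: refresh this entry's recency score
      await this.redis.zadd(`${this.keyPrefix}:recency`, Date.now(), robotsUrl);
    }
    return body;
  }

  async setCachedRobots(robotsUrl: string, body: string): Promise<void> {
    await this.redis.hset(`${this.keyPrefix}:bodies`, robotsUrl, body);
    await this.redis.zadd(`${this.keyPrefix}:recency`, Date.now(), robotsUrl);

    // evict least-recently-used entries once the limit is exceeded
    const count = await this.redis.zcard(`${this.keyPrefix}:recency`);
    if (count > MAX_CACHED_ROBOTS) {
      const evict = await this.redis.zrange(
        `${this.keyPrefix}:recency`,
        0,
        count - MAX_CACHED_ROBOTS - 1,
      );
      if (evict.length) {
        await this.redis.hdel(`${this.keyPrefix}:bodies`, ...evict);
        await this.redis.zrem(`${this.keyPrefix}:recency`, ...evict);
      }
    }
  }
}
```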
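For context, here is a minimal sketch of how the crawler could wire up and consult the module above. The function names (`initRobots`, `shouldQueuePage`) and the log-details shape are illustrative assumptions, not code from browsertrix-crawler.

```ts
import { setRobotsConfig, isDisallowedByRobots } from "./robots.js";
import { RedisCrawlState } from "./state.js";

// Share the crawl's request headers and Redis-backed state with the robots
// module once at startup (setRobotsConfig is the module's real export).
export function initRobots(
  headers: Record<string, string>,
  state: RedisCrawlState,
) {
  setRobotsConfig(headers, state);
}

// Consult robots.txt before queueing a page: returns false when the host's
// robots.txt disallows the given agent. The logDetails object passed here is
// assumed to be a loose record of contextual fields.
export async function shouldQueuePage(
  url: string,
  robotsAgent: string,
): Promise<boolean> {
  return !(await isDisallowedByRobots(url, { page: url }, robotsAgent));
}
```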
New test file (diff `@@ -0,0 +1,35 @@`) verifying that robots.txt is fetched and cached:

```js
import child_process from "child_process";

test("test robots.txt is fetched and cached", async () => {
  const res = child_process.execSync(
    "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug",
  );

  const log = res.toString();

  // robots.txt not found
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Robots.txt invalid, storing empty value","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}',
    ) > 0,
  ).toBe(true);

  // robots.txt found and cached
  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);

  expect(
    log.indexOf(
      '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}',
    ) > 0,
  ).toBe(true);
});
```