+import { fetch } from "undici";
+import robotsParser, { Robot } from "robots-parser";
+
+import { LogDetails, logger } from "./logger.js";
+import { RedisCrawlState } from "./state.js";
+import { getProxyDispatcher } from "./proxy.js";
+import { timedRun } from "./timing.js";
+
+let headers: Record<string, string> = {};
+let crawlState: RedisCrawlState | null = null;
+
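+// Deduplicate in-flight robots.txt fetches by URL so that concurrent
+// callers for the same origin share a single request.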
+const pendingFetches = new Map<string, Promise<string | null>>();
+
+// max seconds to wait for a robots.txt fetch
+const ROBOTS_FETCH_TIMEOUT = 10;
+
+export function setRobotsConfig(
+  _headers: Record<string, string>,
+  state: RedisCrawlState,
+) {
+  headers = _headers;
+  crawlState = state;
+}
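+// Expected to be called once at crawler startup, before any robots.txt
+// checks, so that the fetch headers and shared Redis state are in place.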
+
+export async function isDisallowedByRobots(
+  url: string,
+  logDetails: LogDetails,
+  robotsAgent: string,
+) {
+  const robots = await fetchAndParseRobots(url, logDetails);
+  return robots && robots.isDisallowed(url, robotsAgent);
+}
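+// Hypothetical usage from a crawl queue (names here are illustrative):
+//   if (await isDisallowedByRobots(url, logDetails, "MyCrawlerBot")) {
+//     return; // skip this page
+//   }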
+
+async function fetchAndParseRobots(
+  url: string,
+  logDetails: LogDetails,
+): Promise<Robot | null> {
+  // Fetch robots.txt for the url's host and return a parser.
+  // Results are cached by robots.txt URL in Redis using an LRU cache
+  // implementation that retains the 100 most recently used values.
+  const urlParser = new URL(url);
+  const robotsUrl = `${urlParser.origin}/robots.txt`;
+
+  const cachedRobots = await crawlState!.getCachedRobots(robotsUrl);
+  // an empty string is a valid cached empty robots.txt, so check for null
+  if (cachedRobots !== null) {
+    // don't create a parser, just skip the check if it's an empty string
+    return cachedRobots ? robotsParser(robotsUrl, cachedRobots) : null;
+  }
+
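+  // Cache miss: fetch robots.txt, then cache the body (even when empty).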
+  try {
+    let promise = pendingFetches.get(robotsUrl);
+
+    if (!promise) {
+      promise = timedRun(
+        fetchRobots(robotsUrl, logDetails),
+        ROBOTS_FETCH_TIMEOUT,
+        "Fetching Robots timed out",
+        logDetails,
+        "robots",
+      );
+      pendingFetches.set(robotsUrl, promise);
+    }
+
+    const content = await promise;
+
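+    // a null result means the fetch did not complete (timedRun logs the
+    // timeout case); skip caching so the next URL from this origin retries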
+    if (content === null) {
+      return null;
+    }
+
+    logger.debug(
+      "Caching robots.txt body",
+      { url: robotsUrl, ...logDetails },
+      "robots",
+    );
+    await crawlState!.setCachedRobots(robotsUrl, content);
+
+    // an empty string is cached, but there's no need to create a parser
+    return content ? robotsParser(robotsUrl, content) : null;
+  } catch (e) {
+    // fall through to the warning below
+  } finally {
+    pendingFetches.delete(robotsUrl);
+  }
+  logger.warn(
+    "Failed to fetch robots.txt",
+    {
+      url: robotsUrl,
+      ...logDetails,
+    },
+    "robots",
+  );
+  return null;
+}
+
+async function fetchRobots(
+  url: string,
+  logDetails: LogDetails,
+): Promise<string | null> {
+  logger.debug("Fetching robots.txt", { url, ...logDetails }, "robots");
+
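+  // getProxyDispatcher routes the request through the crawl's configured
+  // proxy, if one is set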
+  const resp = await fetch(url, {
+    headers,
+    dispatcher: getProxyDispatcher(url),
+  });
+
+  if (resp.ok) {
+    const buff = await resp.arrayBuffer();
+    // decode and store at most 100KB of the body
+    return new TextDecoder().decode(buff.slice(0, 100000));
+  }
+
+  logger.debug(
+    "Robots.txt invalid, storing empty value",
+    { url, status: resp.status },
+    "robots",
+  );
+
+  // for error statuses, return an empty string so the miss is cached too
+  return "";
+}