diff --git a/src/crawler.ts b/src/crawler.ts
index 04ecd9b3..8f5680de 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -62,7 +62,8 @@ import {
 } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
-import { ScopedSeed, parseSeeds } from "./util/seeds.js";
+import { ScopedSeed } from "./util/seeds.js";
+import { parseSeeds } from "./util/parseseeds.js";
 import {
   WARCWriter,
   createWARCInfo,
@@ -513,7 +514,7 @@ export class Crawler {
     this.proxyServer = res.proxyServer;
     this.proxyPacUrl = res.proxyPacUrl;
 
-    this.seeds = await parseSeeds(this.params);
+    this.seeds = await parseSeeds(this.params, this.crawlState);
     this.numOriginalSeeds = this.seeds.length;
 
     logger.info("Seeds", this.seeds);
@@ -2652,7 +2653,7 @@ self.__bx_behaviors.selectMainBehavior();
     }
 
     if (await this.crawlState.isSitemapDone()) {
-      logger.info("Sitemap already processed, skipping", "sitemap");
+      logger.info("Sitemap already processed, skipping", {}, "sitemap");
       return;
     }
 
diff --git a/src/util/logger.ts b/src/util/logger.ts
index 7d10939e..3d0c7a77 100644
--- a/src/util/logger.ts
+++ b/src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "seedFile",
 ] as const;
 
 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
diff --git a/src/util/parseseeds.ts b/src/util/parseseeds.ts
new file mode 100644
index 00000000..2e925dd0
--- /dev/null
+++ b/src/util/parseseeds.ts
@@ -0,0 +1,134 @@
+import fs from "fs";
+
+import { collectOnlineSeedFile } from "./file_reader.js";
+import { logger } from "./logger.js";
+import { type CrawlerArgs } from "./argParser.js";
+import { ScopedSeed, removeQuotes, type ScopeType } from "./seeds.js";
+import { type RedisCrawlState } from "./state.js";
+
+export async function parseSeeds(
+  params: CrawlerArgs,
+  crawlState?: RedisCrawlState,
+): Promise<ScopedSeed[]> {
+  let seeds = params.seeds as string[];
+  const scopedSeeds: ScopedSeed[] = [];
+
+  // Re-add seedFileDone from serialized state to Redis if present
+  if (params.state && params.state.seedFileDone && crawlState) {
+    await crawlState.markSeedFileDone();
+  }
+
+  let seedFileDone = false;
+  if (crawlState) {
+    seedFileDone = await crawlState.isSeedFileDone();
+  }
+
+  // Re-add any seeds from seed files from serialized state to Redis
+  if (
+    params.state &&
+    params.state.seedFileSeeds &&
+    seedFileDone &&
+    crawlState
+  ) {
+    for (const seedUrl of params.state.seedFileSeeds) {
+      await crawlState.addSeedFileSeed(seedUrl);
+    }
+  }
+
+  if (params.seedFile && !seedFileDone) {
+    let seedFilePath = params.seedFile as string;
+    if (
+      seedFilePath.startsWith("http://") ||
+      seedFilePath.startsWith("https://")
+    ) {
+      seedFilePath = await collectOnlineSeedFile(seedFilePath);
+    }
+
+    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
+    const urlSeedFileList = urlSeedFile.split("\n");
+
+    if (typeof seeds === "string") {
+      seeds = [seeds];
+    }
+
+    for (const seed of urlSeedFileList) {
+      if (seed) {
+        seeds.push(seed);
+      }
+    }
+  }
+
+  const scopeOpts = {
+    scopeType: params.scopeType as ScopeType | undefined,
+    sitemap: params.sitemap,
+    include: params.include,
+    exclude: params.exclude,
+    depth: params.depth,
+    extraHops: params.extraHops,
+  };
+
+  for (const seed of seeds) {
+    const newSeed = typeof seed === "string" ? { url: seed } : seed;
+    newSeed.url = removeQuotes(newSeed.url);
+
+    try {
+      const scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
+      scopedSeeds.push(scopedSeed);
+      if (params.seedFile && !seedFileDone && crawlState) {
+        await crawlState.addSeedFileSeed(scopedSeed.url);
+        logger.debug(
+          "Pushed seed file seed to Redis",
+          { url: scopedSeed.url },
+          "seedFile",
+        );
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    } catch (e: any) {
+      logger.error("Failed to create seed", {
+        error: e.toString(),
+        ...scopeOpts,
+        ...newSeed,
+      });
+      if (params.failOnFailedSeed) {
+        logger.fatal(
+          "Invalid seed specified, aborting crawl",
+          { url: newSeed.url },
+          "general",
+          1,
+        );
+      }
+    }
+  }
+
+  // If seed file was already successfully parsed, re-add seeds from Redis
+  if (params.seedFile && seedFileDone && crawlState) {
+    const seedFileSeedUrls = await crawlState.getSeedFileSeeds();
+    for (const seedUrl of seedFileSeedUrls) {
+      logger.debug(
+        "Pulled seed file seed from Redis",
+        { url: seedUrl },
+        "seedFile",
+      );
+      try {
+        const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seedUrl });
+        scopedSeeds.push(scopedSeed);
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      } catch (e: any) {
+        logger.error("Failed to create seed from Redis", {
+          error: e.toString(),
+          url: seedUrl,
+        });
+      }
+    }
+  }
+
+  if (!params.qaSource && !scopedSeeds.length) {
+    logger.fatal("No valid seeds specified, aborting crawl");
+  }
+
+  if (params.seedFile && crawlState) {
+    await crawlState.markSeedFileDone();
+  }
+
+  return scopedSeeds;
+}
diff --git a/src/util/seeds.ts b/src/util/seeds.ts
index efdb0b8d..4b02365b 100644
--- a/src/util/seeds.ts
+++ b/src/util/seeds.ts
@@ -1,11 +1,7 @@
-import fs from "fs";
-
 import { MAX_DEPTH } from "./constants.js";
-import { collectOnlineSeedFile } from "./file_reader.js";
 import { logger } from "./logger.js";
-import { type CrawlerArgs } from "./argParser.js";
 
-type ScopeType =
+export type ScopeType =
   | "prefix"
   | "host"
   | "domain"
@@ -304,73 +300,6 @@ export class ScopedSeed {
   }
 }
 
-export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
-  let seeds = params.seeds as string[];
-  const scopedSeeds: ScopedSeed[] = [];
-
-  if (params.seedFile) {
-    let seedFilePath = params.seedFile as string;
-    if (
-      seedFilePath.startsWith("http://") ||
-      seedFilePath.startsWith("https://")
-    ) {
-      seedFilePath = await collectOnlineSeedFile(seedFilePath);
-    }
-
-    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
-    const urlSeedFileList = urlSeedFile.split("\n");
-
-    if (typeof seeds === "string") {
-      seeds = [seeds];
-    }
-
-    for (const seed of urlSeedFileList) {
-      if (seed) {
-        seeds.push(seed);
-      }
-    }
-  }
-
-  const scopeOpts = {
-    scopeType: params.scopeType as ScopeType | undefined,
-    sitemap: params.sitemap,
-    include: params.include,
-    exclude: params.exclude,
-    depth: params.depth,
-    extraHops: params.extraHops,
-  };
-
-  for (const seed of seeds) {
-    const newSeed = typeof seed === "string" ? { url: seed } : seed;
-    newSeed.url = removeQuotes(newSeed.url);
-
-    try {
-      scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    } catch (e: any) {
-      logger.error("Failed to create seed", {
-        error: e.toString(),
-        ...scopeOpts,
-        ...newSeed,
-      });
-      if (params.failOnFailedSeed) {
-        logger.fatal(
-          "Invalid seed specified, aborting crawl",
-          { url: newSeed.url },
-          "general",
-          1,
-        );
-      }
-    }
-  }
-
-  if (!params.qaSource && !scopedSeeds.length) {
-    logger.fatal("No valid seeds specified, aborting crawl");
-  }
-
-  return scopedSeeds;
-}
-
 export function rxEscape(string: string) {
   return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
 }
diff --git a/src/util/state.ts b/src/util/state.ts
index 8960d9b6..adcf30b1 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -182,6 +182,8 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  seedFileDone?: boolean;
+  seedFileSeeds?: string[];
 };
 
 // ============================================================================
@@ -206,6 +208,10 @@ export class RedisCrawlState {
 
   sitemapDoneKey: string;
 
+  seedFileDoneKey: string;
+  seedFileSeedsKey: string;
+  seedFileSeedsMap: string;
+
   waczFilename: string | null = null;
 
   constructor(
@@ -241,6 +247,10 @@ export class RedisCrawlState {
 
     this.sitemapDoneKey = this.key + ":sitemapDone";
 
+    this.seedFileDoneKey = this.key + ":sfDone";
+    this.seedFileSeedsKey = this.key + ":sfSeeds";
+    this.seedFileSeedsMap = this.key + ":sfMap";
+
     this._initLuaCommands(this.redis);
   }
 
@@ -733,8 +743,10 @@ return inx;
     const pending = await this.getPendingList();
     const failed = await this._iterListKeys(this.fkey, seen);
     const errors = await this.getErrorList();
+    const seedFileSeeds = await this.getSeedFileSeeds();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const seedFileDone = await this.isSeedFileDone();
 
     const finished = [...seen.values()];
 
@@ -744,6 +756,8 @@ return inx;
       queued,
       pending,
       sitemapDone,
+      seedFileDone,
+      seedFileSeeds,
       failed,
       errors,
     };
@@ -888,6 +902,10 @@ return inx;
     if (state.sitemapDone) {
       await this.markSitemapDone();
     }
+
+    if (state.seedFileDone) {
+      await this.markSeedFileDone();
+    }
   }
 
   // backwards compatibility: not using done, instead 'finished'
@@ -1032,6 +1050,14 @@ return inx;
     return await this.redis.lpush(this.pageskey, JSON.stringify(data));
   }
 
+  async addSeedFileSeed(url: string) {
+    const ret = await this.redis.sadd(this.seedFileSeedsMap, url);
+    if (ret > 0) {
+      // Push to end of list to keep seeds in order for ids
+      await this.redis.rpush(this.seedFileSeedsKey, url);
+    }
+  }
+
   // add extra seeds from redirect
   async addExtraSeed(
     seeds: ScopedSeed[],
@@ -1085,6 +1111,10 @@ return inx;
     return seeds[newSeedId];
   }
 
+  async getSeedFileSeeds() {
+    return await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
+  }
+
   async getExtraSeeds() {
     const seeds: ExtraRedirectSeed[] = [];
     const res = await this.redis.lrange(this.esKey, 0, -1);
@@ -1106,4 +1136,12 @@ return inx;
     result.modified = this._timestamp();
     await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
   }
+
+  async isSeedFileDone() {
+    return (await this.redis.get(this.seedFileDoneKey)) == "1";
+  }
+
+  async markSeedFileDone() {
+    await this.redis.set(this.seedFileDoneKey, "1");
+  }
 }
diff --git a/tests/scopes.test.js b/tests/scopes.test.js
index 9717fb11..58993fe5 100644
--- a/tests/scopes.test.js
+++ b/tests/scopes.test.js
@@ -1,5 +1,5 @@
 import { parseArgs } from "../dist/util/argParser.js";
-import { parseSeeds } from "../dist/util/seeds.js";
+import { parseSeeds } from "../dist/util/parseseeds.js";
 
 import fs from "fs";
 
diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 6ceab34a..293c43fe 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -1,10 +1,23 @@
 import util from "util";
-import { spawn, exec as execCallback } from "child_process";
+import { spawn, execSync, exec as execCallback } from "child_process";
 import fs from "fs";
+import path from "path";
+import yaml from "js-yaml";
+import Redis from "ioredis";
 
 const exec = util.promisify(execCallback);
 
+const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
+const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";
+
+const expectedSeedFileSeeds = [
+  "https://old.webrecorder.net/about/",
+  "https://specs.webrecorder.net/wacz/1.1.1/",
+  "https://old.webrecorder.net/faq"
+];
+
 let proc = null;
+let redisId = null;
 
 const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
 const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
@@ -20,6 +33,38 @@ afterAll(() => {
 });
 
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+async function waitContainerDone(containerId) {
+  // containerId is initially the full id, but docker ps
+  // only prints the short id (first 12 characters)
+  containerId = containerId.slice(0, 12);
+
+  while (true) {
+    try {
+      const res = execSync("docker ps -q", { encoding: "utf-8" });
+      if (res.indexOf(containerId) < 0) {
+        return;
+      }
+    } catch (e) {
+      console.error(e);
+    }
+    await sleep(500);
+  }
+}
+
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
 test("check that URLs in seed-list are crawled", async () => {
   try {
@@ -91,3 +136,182 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
   }
   expect(foundSeedUrl).toBe(true);
 });
+
+
+let savedStateFile;
+let finished;
+
+test("start crawl from seed list and then interrupt and save state when seeds have been crawled", async () => {
+  let containerId = null;
+
+  try {
+    containerId = execSync(
+      `docker run -d -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
+      { encoding: "utf-8" },
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  // remove existing pagesFile to support reentrancy
+  try {
+    fs.unlinkSync(pagesFile);
+  } catch (e) {
+    // ignore
+  }
+
+  while (true) {
+    try {
+      const pages = fs
+        .readFileSync(pagesFile, { encoding: "utf-8" })
+        .trim()
+        .split("\n");
+
+      if (pages.length >= 4) {
+        break;
+      }
+    } catch (e) {
+      // ignore
+    }
+
+    await sleep(500);
+  }
+
+  await killContainer(containerId);
+
+  const savedStates = fs.readdirSync(
+    "test-crawls/collections/seed-file-restart-test/crawls",
+  );
+  expect(savedStates.length > 0).toEqual(true);
+
+  savedStateFile = savedStates[savedStates.length - 1];
+});
+
+
+test("check saved state for seed file seeds", () => {
+  expect(savedStateFile).toBeTruthy();
+
+  const savedState = fs.readFileSync(
+    path.join("test-crawls/collections/seed-file-restart-test/crawls", savedStateFile),
+    "utf-8",
+  );
+
+  const saved = yaml.load(savedState);
+
+  const state = saved.state;
+  finished = state.finished;
+
+  const numDone = finished.length;
+  const numQueued = state.queued.length;
+
+  expect(!!state).toBe(true);
+  expect(numDone > 0).toEqual(true);
+  expect(numQueued > 0).toEqual(true);
+
+  // ensure seedFileDone is set
+  const seedFileDone = state.seedFileDone;
+  expect(seedFileDone).toEqual(true);
+
+  // ensure seed file seeds are serialized in correct order
+  const seedFileSeeds = state.seedFileSeeds;
+  expect(seedFileSeeds.length).toEqual(3);
+  for (const [index, seed] of seedFileSeeds.entries()) {
+    expect(seed).toEqual(expectedSeedFileSeeds[index]);
+  }
+
+  // ensure all of the seed file seeds are in finished
+  for (const seedUrl of seedFileSeeds) {
+    expect(finished.includes(seedUrl)).toEqual(true);
+  }
+});
+
+
+test("check seed file seed crawl finishes successfully after resuming from saved state", async () => {
+  let containerId = null;
+
+  const port = 36383;
+
+  try {
+    containerId = execSync(
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --debugAccessRedis --config /crawls/collections/seed-file-restart-test/crawls/${savedStateFile} --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
+      { encoding: "utf-8" },
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  await sleep(2000);
+
+  const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null });
+
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+    });
+
+    await sleep(2000);
+
+    for (const url of finished) {
+      const res = await redis.sismember("seedfiletest:s", url);
+      expect(res).toBe(1);
+    }
+  } catch (e) {
+    console.log(e);
+  } finally {
+    await waitContainerDone(containerId);
+  }
+});
+
+test("ensure all pages were crawled", async () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(4);
+
+  const extraPages = fs
+    .readFileSync(extraPagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(extraPages.length).toBe(8);
+});
+
+
+test("ensure that seed file seeds were pulled from Redis on restart", async () => {
+  const logDir = "test-crawls/collections/seed-file-restart-test/logs/";
+  const logFiles = [];
+  fs.readdirSync(logDir).forEach((file) => {
+    if (file.endsWith(".log")) {
+      logFiles.push(path.join(logDir, file));
+    }
+  });
+
+  expect(logFiles.length).toBeGreaterThan(0);
+
+  const logFile = logFiles[logFiles.length - 1];
+  const log = fs.readFileSync(logFile, { encoding: "utf-8" }).trim();
+
+  expect(log.indexOf("Seed file downloaded") > 0).toBe(false);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq"}',
+    ) > 0,
+  ).toBe(true);
+});
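
Note for reviewers: the restart tests above rely on addSeedFileSeed() keeping the ":sfSeeds" list deduplicated while preserving first-insertion order (the sadd into ":sfMap" gates the rpush). Below is a minimal sketch of that invariant using a hypothetical in-memory stand-in, FakeSeedFileState, rather than a live Redis connection; none of these names exist in the codebase.

// Hypothetical stand-in for the seed-file methods on RedisCrawlState.
// Not part of this change set; it only demonstrates the invariant that
// addSeedFileSeed() maintains: membership in the set gates the list push,
// so the list stays deduplicated in first-insertion order.
class FakeSeedFileState {
  private seen = new Set<string>(); // plays the role of the ":sfMap" set
  private ordered: string[] = []; // plays the role of the ":sfSeeds" list
  private done = false; // plays the role of the ":sfDone" key

  async addSeedFileSeed(url: string) {
    // Set.has() stands in for `sadd` returning 0 on duplicate members
    if (!this.seen.has(url)) {
      this.seen.add(url);
      this.ordered.push(url); // stands in for `rpush`
    }
  }

  async getSeedFileSeeds(): Promise<string[]> {
    return [...this.ordered]; // stands in for `lrange 0 -1`
  }

  async markSeedFileDone() {
    this.done = true;
  }

  async isSeedFileDone(): Promise<boolean> {
    return this.done;
  }
}

async function demo() {
  const state = new FakeSeedFileState();
  const fromSavedState = [
    "https://old.webrecorder.net/about/",
    "https://specs.webrecorder.net/wacz/1.1.1/",
    "https://old.webrecorder.net/faq",
  ];

  // parseSeeds() replays serialized seeds into Redis on restart; replaying
  // must be a no-op, or a restarted crawl would duplicate seed ids.
  for (const url of fromSavedState) {
    await state.addSeedFileSeed(url);
    await state.addSeedFileSeed(url); // duplicate add changes nothing
  }
  await state.markSeedFileDone();

  // Prints the three seeds once, in original seed-file order, and `true`
  console.log(await state.getSeedFileSeeds(), await state.isSeedFileDone());
}

demo().catch(console.error);

Because re-adding is idempotent, parseSeeds() can safely re-add state.seedFileSeeds from a saved config into Redis on every restart without shifting the ids that queued URLs reference.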