From ecb7d02718153f10239fc4b88c0f154214fbd4c3 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 14 Oct 2025 11:55:16 -0400
Subject: [PATCH 1/9] Fix logging for sitemap done

---
 src/crawler.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 04ecd9b3..2cc1592d 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -2652,7 +2652,7 @@ self.__bx_behaviors.selectMainBehavior();
     }

     if (await this.crawlState.isSitemapDone()) {
-      logger.info("Sitemap already processed, skipping", "sitemap");
+      logger.info("Sitemap already processed, skipping", {}, "sitemap");
       return;
     }


From df36817caf6a4b922c2f2e5da3ee83f522cf3171 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 14 Oct 2025 12:02:01 -0400
Subject: [PATCH 2/9] Don't attempt to fetch or parse seed file again after succeeding

Use a hacky any to avoid a circular import; will fix properly in a
later commit.

---
 src/crawler.ts     |  2 +-
 src/util/logger.ts |  1 +
 src/util/seeds.ts  | 14 ++++++++++++--
 src/util/state.ts  | 17 +++++++++++++++++
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/src/crawler.ts b/src/crawler.ts
index 2cc1592d..21384591 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -513,7 +513,7 @@ export class Crawler {
     this.proxyServer = res.proxyServer;
     this.proxyPacUrl = res.proxyPacUrl;

-    this.seeds = await parseSeeds(this.params);
+    this.seeds = await parseSeeds(this.params, this.crawlState);
     this.numOriginalSeeds = this.seeds.length;

     logger.info("Seeds", this.seeds);
diff --git a/src/util/logger.ts b/src/util/logger.ts
index 7d10939e..3d0c7a77 100644
--- a/src/util/logger.ts
+++ b/src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "seedFile",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
diff --git a/src/util/seeds.ts b/src/util/seeds.ts
index efdb0b8d..dc63a4c1 100644
--- a/src/util/seeds.ts
+++ b/src/util/seeds.ts
@@ -304,11 +304,17 @@ export class ScopedSeed {
   }
 }

-export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
+export async function parseSeeds(
+  params: CrawlerArgs,
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  crawlState: any,
+): Promise<ScopedSeed[]> {
   let seeds = params.seeds as string[];
   const scopedSeeds: ScopedSeed[] = [];

-  if (params.seedFile) {
+  if (params.seedFile && (await crawlState.isSeedFileDone())) {
+    logger.info("Seed file already processed, skipping", {}, "seedFile");
+  } else if (params.seedFile) {
     let seedFilePath = params.seedFile as string;
     if (
       seedFilePath.startsWith("http://") ||
@@ -368,6 +374,10 @@
     logger.fatal("No valid seeds specified, aborting crawl");
   }

+  if (params.seedFile) {
+    await crawlState.markSeedFileDone();
+  }
+
   return scopedSeeds;
 }
diff --git a/src/util/state.ts b/src/util/state.ts
index 8960d9b6..20a9ce48 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -182,6 +182,7 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  seedFileDone: boolean;
 };

 // ============================================================================
@@ -205,6 +206,7 @@ export class RedisCrawlState {
   esMap: string;

   sitemapDoneKey: string;
+  seedFileDoneKey: string;

   waczFilename: string | null = null;

@@ -240,6 +242,7 @@ export class RedisCrawlState {
     this.esMap = this.key + ":esMap";

     this.sitemapDoneKey = this.key + ":sitemapDone";
+    this.seedFileDoneKey = this.key + ":seedFileDone";

     this._initLuaCommands(this.redis);
   }
@@ -735,6 +738,7 @@ return inx;
     const errors = await this.getErrorList();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const seedFileDone = await this.isSeedFileDone();

     const finished = [...seen.values()];

@@ -744,6 +748,7 @@ return inx;
       queued,
       pending,
       sitemapDone,
+      seedFileDone,
       failed,
       errors,
     };
@@ -888,6 +893,10 @@ return inx;
     if (state.sitemapDone) {
       await this.markSitemapDone();
     }
+
+    if (state.seedFileDone) {
+      await this.markSeedFileDone();
+    }
   }

   // backwards compatibility: not using done, instead 'finished'
@@ -1106,4 +1115,12 @@ return inx;
     result.modified = this._timestamp();
     await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
   }
+
+  async isSeedFileDone() {
+    return (await this.redis.get(this.seedFileDoneKey)) == "1";
+  }
+
+  async markSeedFileDone() {
+    await this.redis.set(this.seedFileDoneKey, "1");
+  }
 }

From 35d3052b0abeef718e1b693e440c505fb2f12641 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 4 Nov 2025 10:47:33 -0500
Subject: [PATCH 3/9] Store seed file seeds in Redis after initial file read

Also move parseSeeds to separate module to avoid circular import

---
 src/crawler.ts         |   3 +-
 src/util/parseseeds.ts | 113 +++++++++++++++++++++++++++++++++++++++++
 src/util/seeds.ts      |  83 +-----------------------------
 src/util/state.ts      |  36 ++++++++++++-
 tests/scopes.test.js   |   2 +-
 5 files changed, 152 insertions(+), 85 deletions(-)
 create mode 100644 src/util/parseseeds.ts

diff --git a/src/crawler.ts b/src/crawler.ts
index 21384591..8f5680de 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -62,7 +62,8 @@ import {
 } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
-import { ScopedSeed, parseSeeds } from "./util/seeds.js";
+import { ScopedSeed } from "./util/seeds.js";
+import { parseSeeds } from "./util/parseseeds.js";
 import {
   WARCWriter,
   createWARCInfo,
diff --git a/src/util/parseseeds.ts b/src/util/parseseeds.ts
new file mode 100644
index 00000000..d2c787d4
--- /dev/null
+++ b/src/util/parseseeds.ts
@@ -0,0 +1,113 @@
+import fs from "fs";
+
+import { collectOnlineSeedFile } from "./file_reader.js";
+import { logger } from "./logger.js";
+import { type CrawlerArgs } from "./argParser.js";
+import { ScopedSeed, removeQuotes, type ScopeType } from "./seeds.js";
+import { type RedisCrawlState } from "./state.js";
+
+export async function parseSeeds(
+  params: CrawlerArgs,
+  crawlState: RedisCrawlState,
+): Promise<ScopedSeed[]> {
+  let seeds = params.seeds as string[];
+  const scopedSeeds: ScopedSeed[] = [];
+
+  const seedFileDone = await crawlState.isSeedFileDone();
+
+  if (params.seedFile && !seedFileDone) {
+    let seedFilePath = params.seedFile as string;
+    if (
+      seedFilePath.startsWith("http://") ||
+      seedFilePath.startsWith("https://")
+    ) {
+      seedFilePath = await collectOnlineSeedFile(seedFilePath);
+    }
+
+    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
+    const urlSeedFileList = urlSeedFile.split("\n");
+
+    if (typeof seeds === "string") {
+      seeds = [seeds];
+    }
+
+    for (const seed of urlSeedFileList) {
+      if (seed) {
+        seeds.push(seed);
+      }
+    }
+  }
+
+  const scopeOpts = {
+    scopeType: params.scopeType as ScopeType | undefined,
+    sitemap: params.sitemap,
+    include: params.include,
+    exclude: params.exclude,
+    depth: params.depth,
+    extraHops: params.extraHops,
+  };
+
+  for (const seed of seeds) {
+    const newSeed = typeof seed === "string" ? { url: seed } : seed;
+    newSeed.url = removeQuotes(newSeed.url);
+
+    try {
+      const scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
+      scopedSeeds.push(scopedSeed);
+      if (params.seedFile) {
+        await crawlState.addSeedFileSeed(scopedSeed);
+        logger.debug(
+          "Pushed seed file seed to Redis",
+          { url: scopedSeed.url },
+          "seedFile",
+        );
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    } catch (e: any) {
+      logger.error("Failed to create seed", {
+        error: e.toString(),
+        ...scopeOpts,
+        ...newSeed,
+      });
+      if (params.failOnFailedSeed) {
+        logger.fatal(
+          "Invalid seed specified, aborting crawl",
+          { url: newSeed.url },
+          "general",
+          1,
+        );
+      }
+    }
+  }
+
+  if (params.seedFile && seedFileDone) {
+    const seedFileScopedSeeds = await crawlState.getSeedFileSeeds();
+    for (const seed of seedFileScopedSeeds) {
+      logger.debug(
+        "Pulled seed file seed from Redis",
+        { url: seed.url },
+        "seedFile",
+      );
+      try {
+        const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seed.url });
+        scopedSeeds.push(scopedSeed);
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      } catch (e: any) {
+        logger.error("Failed to create seed from Redis", {
+          error: e.toString(),
+          ...seed,
+        });
+      }
+    }
+  }
+
+  if (!params.qaSource && !scopedSeeds.length) {
+    logger.fatal("No valid seeds specified, aborting crawl");
+  }
+
+  if (params.seedFile) {
+    await crawlState.markSeedFileDone();
+  }
+
+  return scopedSeeds;
+}
diff --git a/src/util/seeds.ts b/src/util/seeds.ts
index dc63a4c1..4b02365b 100644
--- a/src/util/seeds.ts
+++ b/src/util/seeds.ts
@@ -1,11 +1,7 @@
-import fs from "fs";
-
 import { MAX_DEPTH } from "./constants.js";
-import { collectOnlineSeedFile } from "./file_reader.js";
 import { logger } from "./logger.js";
-import { type CrawlerArgs } from "./argParser.js";

-type ScopeType =
+export type ScopeType =
   | "prefix"
   | "host"
   | "domain"
@@ -304,83 +300,6 @@ export class ScopedSeed {
   }
 }

-export async function parseSeeds(
-  params: CrawlerArgs,
-  // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  crawlState: any,
-): Promise<ScopedSeed[]> {
-  let seeds = params.seeds as string[];
-  const scopedSeeds: ScopedSeed[] = [];
-
-  if (params.seedFile && (await crawlState.isSeedFileDone())) {
-    logger.info("Seed file already processed, skipping", {}, "seedFile");
-  } else if (params.seedFile) {
-    let seedFilePath = params.seedFile as string;
-    if (
-      seedFilePath.startsWith("http://") ||
-      seedFilePath.startsWith("https://")
-    ) {
-      seedFilePath = await collectOnlineSeedFile(seedFilePath);
-    }
-
-    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
-    const urlSeedFileList = urlSeedFile.split("\n");
-
-    if (typeof seeds === "string") {
-      seeds = [seeds];
-    }
-
-    for (const seed of urlSeedFileList) {
-      if (seed) {
-        seeds.push(seed);
-      }
-    }
-  }
-
-  const scopeOpts = {
-    scopeType: params.scopeType as ScopeType | undefined,
-    sitemap: params.sitemap,
-    include: params.include,
-    exclude: params.exclude,
-    depth: params.depth,
-    extraHops: params.extraHops,
-  };
-
-  for (const seed of seeds) {
-    const newSeed = typeof seed === "string" ? { url: seed } : seed;
-    newSeed.url = removeQuotes(newSeed.url);
-
-    try {
-      scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    } catch (e: any) {
-      logger.error("Failed to create seed", {
-        error: e.toString(),
-        ...scopeOpts,
-        ...newSeed,
-      });
-      if (params.failOnFailedSeed) {
-        logger.fatal(
-          "Invalid seed specified, aborting crawl",
-          { url: newSeed.url },
-          "general",
-          1,
-        );
-      }
-    }
-  }
-
-  if (!params.qaSource && !scopedSeeds.length) {
-    logger.fatal("No valid seeds specified, aborting crawl");
-  }
-
-  if (params.seedFile) {
-    await crawlState.markSeedFileDone();
-  }
-
-  return scopedSeeds;
-}
 export function rxEscape(string: string) {
   return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
 }
diff --git a/src/util/state.ts b/src/util/state.ts
index 20a9ce48..34c30cfc 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -183,6 +183,7 @@ export type SaveState = {
   extraSeeds: string[];
   sitemapDone: boolean;
   seedFileDone: boolean;
+  seedFileSeeds: string[];
 };

 // ============================================================================
@@ -206,7 +207,10 @@ export class RedisCrawlState {
   esMap: string;

   sitemapDoneKey: string;
+
   seedFileDoneKey: string;
+  seedFileSeedsKey: string;
+  seedFileSeedsMap: string;

   waczFilename: string | null = null;

@@ -240,7 +246,10 @@ export class RedisCrawlState {
     this.esMap = this.key + ":esMap";

     this.sitemapDoneKey = this.key + ":sitemapDone";
-    this.seedFileDoneKey = this.key + ":seedFileDone";
+
+    this.seedFileDoneKey = this.key + ":sfDone";
+    this.seedFileSeedsKey = this.key + ":sfSeeds";
+    this.seedFileSeedsMap = this.key + ":sfMap";

     this._initLuaCommands(this.redis);
   }
@@ -736,6 +743,7 @@ return inx;
     const pending = await this.getPendingList();
     const failed = await this._iterListKeys(this.fkey, seen);
     const errors = await this.getErrorList();
+    const seedFileSeeds = await this._iterListKeys(this.seedFileSeedsKey, seen);
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
     const seedFileDone = await this.isSeedFileDone();
@@ -749,6 +757,7 @@ return inx;
       pending,
       sitemapDone,
       seedFileDone,
+      seedFileSeeds,
       failed,
       errors,
     };
@@ -846,6 +855,13 @@ return inx;
       await this.redis.set(this.dkey, state.finished.length);
     }

+    if (state.seedFileSeeds) {
+      for (const seed of state.seedFileSeeds) {
+        const scopedSeed: ScopedSeed = JSON.parse(seed);
+        await this.addSeedFileSeed(scopedSeed);
+      }
+    }
+
     if (state.extraSeeds) {
       const origLen = seeds.length;

@@ -1041,6 +1057,14 @@ return inx;
     return await this.redis.lpush(this.pageskey, JSON.stringify(data));
   }

+  async addSeedFileSeed(seed: ScopedSeed) {
+    const ret = await this.redis.sadd(this.seedFileSeedsMap, seed.url);
+    if (ret > 0) {
+      // Push to end of list to keep seeds in order for ids
+      await this.redis.rpush(this.seedFileSeedsKey, JSON.stringify(seed));
+    }
+  }
+
   // add extra seeds from redirect
   async addExtraSeed(
     seeds: ScopedSeed[],
@@ -1094,6 +1118,16 @@ return inx;
     return seeds[newSeedId];
   }

+  async getSeedFileSeeds() {
+    const seeds: ScopedSeed[] = [];
+
+    const res = await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
+    for (const key of res) {
+      seeds.push(JSON.parse(key));
+    }
+    return seeds;
+  }
+
   async getExtraSeeds() {
     const seeds: ExtraRedirectSeed[] = [];
     const res = await this.redis.lrange(this.esKey, 0, -1);
diff --git a/tests/scopes.test.js b/tests/scopes.test.js
index 9717fb11..58993fe5 100644
--- a/tests/scopes.test.js
+++ b/tests/scopes.test.js
@@ -1,5 +1,5 @@
 import { parseArgs } from "../dist/util/argParser.js";
-import { parseSeeds } from "../dist/util/seeds.js";
+import { parseSeeds } from "../dist/util/parseseeds.js";

 import fs from "fs";

From 17b6b23f3b92a5350bea3d6bf252ddc363c2114a Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 4 Nov 2025 17:52:08 -0500
Subject: [PATCH 4/9] Support re-adding seed file seeds from serialized state, fix typing

Allow crawlState to be undefined in parseSeeds for use in scope tests

---
 src/util/parseseeds.ts | 32 +++++++++++++++++++++++++++-----
 src/util/state.ts      |  7 -------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/src/util/parseseeds.ts b/src/util/parseseeds.ts
index d2c787d4..81f00fff 100644
--- a/src/util/parseseeds.ts
+++ b/src/util/parseseeds.ts
@@ -8,12 +8,33 @@

 export async function parseSeeds(
   params: CrawlerArgs,
-  crawlState: RedisCrawlState,
+  crawlState?: RedisCrawlState,
 ): Promise<ScopedSeed[]> {
   let seeds = params.seeds as string[];
   const scopedSeeds: ScopedSeed[] = [];

-  const seedFileDone = await crawlState.isSeedFileDone();
+  // Re-add seedFileDone from serialized state to Redis if present
+  if (params.state && params.state.seedFileDone && crawlState) {
+    await crawlState.markSeedFileDone();
+  }
+
+  let seedFileDone = false;
+  if (crawlState) {
+    seedFileDone = await crawlState.isSeedFileDone();
+  }
+
+  // Re-add any seeds from seed files from serialized state to Redis
+  if (
+    params.state &&
+    params.state.seedFileSeeds &&
+    seedFileDone &&
+    crawlState
+  ) {
+    for (const seed of params.state.seedFileSeeds) {
+      const scopedSeed: ScopedSeed = JSON.parse(seed);
+      await crawlState.addSeedFileSeed(scopedSeed);
+    }
+  }

   if (params.seedFile && !seedFileDone) {
     let seedFilePath = params.seedFile as string;
@@ -54,7 +75,7 @@ export async function parseSeeds(
     try {
       const scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
       scopedSeeds.push(scopedSeed);
-      if (params.seedFile) {
+      if (params.seedFile && !seedFileDone && crawlState) {
         await crawlState.addSeedFileSeed(scopedSeed);
         logger.debug(
           "Pushed seed file seed to Redis",
@@ -80,7 +101,8 @@ export async function parseSeeds(
     }
   }

-  if (params.seedFile && seedFileDone) {
+  // If seed file was already successfully parsed, re-add seeds from Redis
+  if (params.seedFile && seedFileDone && crawlState) {
     const seedFileScopedSeeds = await crawlState.getSeedFileSeeds();
     for (const seed of seedFileScopedSeeds) {
       logger.debug(
         "Pulled seed file seed from Redis",
@@ -105,7 +127,7 @@ export async function parseSeeds(
     logger.fatal("No valid seeds specified, aborting crawl");
   }

-  if (params.seedFile) {
+  if (params.seedFile && crawlState) {
     await crawlState.markSeedFileDone();
   }
diff --git a/src/util/state.ts b/src/util/state.ts
index 34c30cfc..76e62c52 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -855,13 +855,6 @@ return inx;
       await this.redis.set(this.dkey, state.finished.length);
     }

-    if (state.seedFileSeeds) {
-      for (const seed of state.seedFileSeeds) {
-        const scopedSeed: ScopedSeed = JSON.parse(seed);
-        await this.addSeedFileSeed(scopedSeed);
-      }
-    }
-
     if (state.extraSeeds) {
       const origLen = seeds.length;

From df395a74360856696c56fac3de7e5f87ac2f4505 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 6 Nov 2025 11:17:53 -0500
Subject: [PATCH 5/9] Simplify to only storing seed file seed URLs and add tests

The only information in seed files is the URL, so no need to complicate
things by storing more than that.
---
 src/util/parseseeds.ts      |  17 ++-
 src/util/state.ts           |  16 +--
 tests/url_file_list.test.js | 217 +++++++++++++++++++++++++++++++++++-
 3 files changed, 229 insertions(+), 21 deletions(-)

diff --git a/src/util/parseseeds.ts b/src/util/parseseeds.ts
index 81f00fff..2e925dd0 100644
--- a/src/util/parseseeds.ts
+++ b/src/util/parseseeds.ts
@@ -30,9 +30,8 @@ export async function parseSeeds(
     seedFileDone &&
     crawlState
   ) {
-    for (const seed of params.state.seedFileSeeds) {
-      const scopedSeed: ScopedSeed = JSON.parse(seed);
-      await crawlState.addSeedFileSeed(scopedSeed);
+    for (const seedUrl of params.state.seedFileSeeds) {
+      await crawlState.addSeedFileSeed(seedUrl);
     }
   }
@@ -76,7 +75,7 @@
       const scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
       scopedSeeds.push(scopedSeed);
       if (params.seedFile && !seedFileDone && crawlState) {
-        await crawlState.addSeedFileSeed(scopedSeed);
+        await crawlState.addSeedFileSeed(scopedSeed.url);
         logger.debug(
           "Pushed seed file seed to Redis",
           { url: scopedSeed.url },
@@ -103,21 +102,21 @@ export async function parseSeeds(

   // If seed file was already successfully parsed, re-add seeds from Redis
   if (params.seedFile && seedFileDone && crawlState) {
-    const seedFileScopedSeeds = await crawlState.getSeedFileSeeds();
-    for (const seed of seedFileScopedSeeds) {
+    const seedFileSeedUrls = await crawlState.getSeedFileSeeds();
+    for (const seedUrl of seedFileSeedUrls) {
       logger.debug(
         "Pulled seed file seed from Redis",
-        { url: seed.url },
+        { url: seedUrl },
         "seedFile",
       );
       try {
-        const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seed.url });
+        const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seedUrl });
         scopedSeeds.push(scopedSeed);
         // eslint-disable-next-line @typescript-eslint/no-explicit-any
       } catch (e: any) {
         logger.error("Failed to create seed from Redis", {
           error: e.toString(),
-          ...seed,
+          url: seedUrl,
         });
       }
     }
diff --git a/src/util/state.ts b/src/util/state.ts
index 76e62c52..0a75efb2 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -743,7 +743,7 @@ return inx;
     const pending = await this.getPendingList();
     const failed = await this._iterListKeys(this.fkey, seen);
     const errors = await this.getErrorList();
-    const seedFileSeeds = await this._iterListKeys(this.seedFileSeedsKey, seen);
+    const seedFileSeeds = await this.getSeedFileSeeds();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
     const seedFileDone = await this.isSeedFileDone();
@@ -1050,11 +1050,11 @@ return inx;
     return await this.redis.lpush(this.pageskey, JSON.stringify(data));
   }

-  async addSeedFileSeed(seed: ScopedSeed) {
-    const ret = await this.redis.sadd(this.seedFileSeedsMap, seed.url);
+  async addSeedFileSeed(url: string) {
+    const ret = await this.redis.sadd(this.seedFileSeedsMap, url);
     if (ret > 0) {
       // Push to end of list to keep seeds in order for ids
-      await this.redis.rpush(this.seedFileSeedsKey, JSON.stringify(seed));
+      await this.redis.rpush(this.seedFileSeedsKey, url);
     }
   }
@@ -1112,13 +1112,7 @@ return inx;
   }

   async getSeedFileSeeds() {
-    const seeds: ScopedSeed[] = [];
-
-    const res = await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
-    for (const key of res) {
-      seeds.push(JSON.parse(key));
-    }
-    return seeds;
+    return await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
   }

   async getExtraSeeds() {
     const seeds: ExtraRedirectSeed[] = [];
     const res = await this.redis.lrange(this.esKey, 0, -1);
diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 6ceab34a..96539365 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -1,10 +1,23 @@
 import util from "util";
-import { spawn, exec as execCallback } from "child_process";
+import { spawn, execSync, exec as execCallback } from "child_process";
 import fs from "fs";
+import path from "path";
+import yaml from "js-yaml";
+import Redis from "ioredis";

 const exec = util.promisify(execCallback);

+const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
+const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";
+
+const expectedSeedFileSeeds = [
+  "https://old.webrecorder.net/about/",
+  "https://specs.webrecorder.net/wacz/1.1.1/",
+  "https://old.webrecorder.net/faq"
+];
+
 let proc = null;
+let redisId = null;

 const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
 const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
@@ -20,6 +33,38 @@ afterAll(() => {
 });

+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+async function waitContainerDone(containerId) {
+  // containerId is initially the full id, but docker ps
+  // only prints the short id (first 12 characters)
+  containerId = containerId.slice(0, 12);
+
+  while (true) {
+    try {
+      const res = execSync("docker ps -q", { encoding: "utf-8" });
+      if (res.indexOf(containerId) < 0) {
+        return;
+      }
+    } catch (e) {
+      console.error(e);
+    }
+    await sleep(500);
+  }
+}
+
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
 test("check that URLs in seed-list are crawled", async () => {
   try {
@@ -91,3 +136,173 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
   }
   expect(foundSeedUrl).toBe(true);
 });
+
+
+let savedStateFile;
+let finished;
+
+test("start crawl from seed list and then interrupt and save state when seeds have been crawled", async () => {
+  let containerId = null;
+
+  try {
+    containerId = execSync(
+      `docker run -d -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
+      { encoding: "utf-8" },
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  // remove existing pagesFile to support reentrancy
+  try {
+    fs.unlinkSync(pagesFile);
+  } catch (e) {
+    // ignore
+  }
+
+  while (true) {
+    try {
+      const pages = fs
+        .readFileSync(pagesFile, { encoding: "utf-8" })
+        .trim()
+        .split("\n");
+
+      if (pages.length >= 4) {
+        break;
+      }
+    } catch (e) {
+      // ignore
+    }
+
+    await sleep(500);
+  }
+
+  await killContainer(containerId);
+
+  const savedStates = fs.readdirSync(
+    "test-crawls/collections/seed-file-restart-test/crawls",
+  );
+  expect(savedStates.length > 0).toEqual(true);
+
+  savedStateFile = savedStates[savedStates.length - 1];
+});
+
+
+test("check saved state for seed file seeds", () => {
+  expect(savedStateFile).toBeTruthy();
+
+  const savedState = fs.readFileSync(
+    path.join("test-crawls/collections/seed-file-restart-test/crawls", savedStateFile),
+    "utf-8",
+  );
+
+  const saved = yaml.load(savedState);
+
+  const state = saved.state;
+  finished = state.finished;
+
+  const numDone = finished.length;
+  const numQueued = state.queued.length;
+
+  expect(!!state).toBe(true);
+  expect(numDone > 0).toEqual(true);
+  expect(numQueued > 0).toEqual(true);
+
+  const seedFileDone = state.seedFileDone;
+  expect(seedFileDone).toEqual(true);
+
+  const seedFileSeeds = state.seedFileSeeds;
+  expect(seedFileSeeds.length).toEqual(3);
+  for (const [index, seed] of seedFileSeeds.entries()) {
+    expect(seed).toEqual(expectedSeedFileSeeds[index]);
+  }
+});
+
+
+test("check seed file seed crawl finishes successfully after resuming from saved state", async () => {
+  let containerId = null;
+
+  const port = 36383;
+
+  try {
+    containerId = execSync(
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --debugAccessRedis --config /crawls/collections/seed-file-restart-test/crawls/${savedStateFile} --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
+      { encoding: "utf-8" },
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  await sleep(2000);
+
+  const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null });
+
+  try {
+    await redis.connect({
+      maxRetriesPerRequest: 100,
+    });
+
+    await sleep(2000);
+
+    for (const url of finished) {
+      const res = await redis.sismember("seedfiletest:s", url);
+      expect(res).toBe(1);
+    }
+  } catch (e) {
+    console.log(e);
+  } finally {
+    await waitContainerDone(containerId);
+  }
+});
+
+test("ensure all pages were crawled", async () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(4);
+
+  const extraPages = fs
+    .readFileSync(extraPagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(extraPages.length).toBe(8);
+});
+
+
+test("ensure that seed file seeds were pulled from Redis on restart", async () => {
+  const logDir = "test-crawls/collections/seed-file-restart-test/logs/";
+  const logFiles = [];
+  fs.readdirSync(logDir).forEach((file) => {
+    if (file.endsWith(".log")) {
+      logFiles.push(path.join(logDir, file));
+    }
+  });
+
+  expect(logFiles.length).toBeGreaterThan(0);
+
+  const logFile = logFiles[logFiles.length - 1];
+  const log = fs.readFileSync(logFile, { encoding: "utf-8" }).trim();
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq"}',
+    ) > 0,
+  ).toBe(true);
+});

From 4f9f56a63ea92fff94dc8a49123a4767a4055178 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 19 Nov 2025 18:00:02 -0500
Subject: [PATCH 6/9] Test that seed file seeds are in finished

---
 tests/url_file_list.test.js | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 96539365..3edf88f4 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -216,6 +216,10 @@ test("check saved state for seed file seeds", () => {
     expect(seed).toEqual(expectedSeedFileSeeds[index]);
   }
+
+  for (const [index, seed] of finished.entries()) {
+    expect(seed).toEqual(expectedSeedFileSeeds[index]);
+  }
 });

From 3b87f11286f93119f15b82b1bf0d462957348692 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 19 Nov 2025 18:04:23 -0500
Subject: [PATCH 7/9] Make sure seed file isn't re-downloaded in test

---
 tests/url_file_list.test.js | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 3edf88f4..6129527e 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -292,6 +292,8 @@ test("ensure that seed file seeds were pulled from Redis on restart", async () =
   const logFile = logFiles[logFiles.length - 1];
   const log = fs.readFileSync(logFile, { encoding: "utf-8" }).trim();

+  expect(log.indexOf("Seed file downloaded") > 0).toBe(false);
+
   expect(
     log.indexOf(
       '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',

From 0c7e2ce37e770942f8bf390dc9d3403ecf1a0df7 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 19 Nov 2025 18:15:13 -0500
Subject: [PATCH 8/9] Tweak test, order in finished doesn't matter

---
 tests/url_file_list.test.js | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/url_file_list.test.js b/tests/url_file_list.test.js
index 6129527e..293c43fe 100644
--- a/tests/url_file_list.test.js
+++ b/tests/url_file_list.test.js
@@ -208,17 +208,20 @@ test("check saved state for seed file seeds", () => {
   expect(numDone > 0).toEqual(true);
   expect(numQueued > 0).toEqual(true);

+  // ensure seedFileDone is set
   const seedFileDone = state.seedFileDone;
   expect(seedFileDone).toEqual(true);

+  // ensure seed file seeds are serialized in correct order
   const seedFileSeeds = state.seedFileSeeds;
   expect(seedFileSeeds.length).toEqual(3);
   for (const [index, seed] of seedFileSeeds.entries()) {
     expect(seed).toEqual(expectedSeedFileSeeds[index]);
   }

-  for (const [index, seed] of finished.entries()) {
-    expect(seed).toEqual(expectedSeedFileSeeds[index]);
+  // ensure all of the seed file seeds are in finished
+  for (const seedUrl of seedFileSeeds) {
+    expect(finished.includes(seedUrl)).toEqual(true);
   }
 });

From 510f81ad451b09819da27b6a9f82a8c6ff33ab7b Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 19 Nov 2025 19:15:28 -0500
Subject: [PATCH 9/9] Update SaveState type

---
 src/util/state.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/util/state.ts b/src/util/state.ts
index 0a75efb2..adcf30b1 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -182,8 +182,8 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
-  seedFileDone: boolean;
-  seedFileSeeds: string[];
+  seedFileDone?: boolean;
+  seedFileSeeds?: string[];
 };

 // ============================================================================
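
Note on the data structure underlying patches 3-5: seed file URLs are kept in two Redis keys per crawl — a set (":sfMap") that guards uniqueness and a list (":sfSeeds") that preserves insertion order, so seed ids stay stable when a crawl resumes from saved state. A minimal standalone sketch of that pattern follows, assuming a local Redis and the ioredis client already used by the tests; the addUnique helper and the "mycrawl" key prefix are illustrative only, not part of these patches.

import Redis from "ioredis";

// Mirrors the shape of addSeedFileSeed(): SADD returns 1 only for
// new members, so the RPUSH runs once per unique URL and the list
// order reflects first insertion.
async function addUnique(redis: Redis, prefix: string, url: string) {
  const added = await redis.sadd(`${prefix}:sfMap`, url);
  if (added > 0) {
    await redis.rpush(`${prefix}:sfSeeds`, url);
  }
}

async function main() {
  const redis = new Redis("redis://127.0.0.1:6379/0");
  await addUnique(redis, "mycrawl", "https://example.com/");
  // Duplicate: the set membership check prevents a second push
  await addUnique(redis, "mycrawl", "https://example.com/");
  // Mirrors getSeedFileSeeds(): read URLs back in original order
  console.log(await redis.lrange("mycrawl:sfSeeds", 0, -1));
  redis.disconnect();
}

main().catch(console.error);

A set alone would lose ordering and a list alone would admit duplicates; pairing the two gives de-duplication with stable ordering at the cost of one extra key per crawl.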