7 changes: 4 additions & 3 deletions src/crawler.ts
@@ -62,7 +62,8 @@ import {
 } from "puppeteer-core";
 import { Recorder } from "./util/recorder.js";
 import { SitemapReader } from "./util/sitemapper.js";
-import { ScopedSeed, parseSeeds } from "./util/seeds.js";
+import { ScopedSeed } from "./util/seeds.js";
+import { parseSeeds } from "./util/parseseeds.js";
 import {
   WARCWriter,
   createWARCInfo,
@@ -513,7 +514,7 @@ export class Crawler {
     this.proxyServer = res.proxyServer;
     this.proxyPacUrl = res.proxyPacUrl;

-    this.seeds = await parseSeeds(this.params);
+    this.seeds = await parseSeeds(this.params, this.crawlState);
     this.numOriginalSeeds = this.seeds.length;

     logger.info("Seeds", this.seeds);
Expand Down Expand Up @@ -2652,7 +2653,7 @@ self.__bx_behaviors.selectMainBehavior();
}

if (await this.crawlState.isSitemapDone()) {
logger.info("Sitemap already processed, skipping", "sitemap");
logger.info("Sitemap already processed, skipping", {}, "sitemap");
return;
}

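The second argument to parseSeeds is new and optional (crawlState?: RedisCrawlState in parseseeds.ts below), so callers without a live Redis connection keep the old stateless behavior. A minimal sketch of both call shapes, assuming params comes from parseArgs and crawlState is an initialized RedisCrawlState:

import { parseSeeds } from "./util/parseseeds.js";

// Stateless (e.g. in tests): the seed file is fetched and parsed on every
// run, matching the behavior of the old seeds.ts implementation.
const seeds = await parseSeeds(params);

// Stateful (as in Crawler above): parsed seed-file URLs are cached in Redis,
// so a crawl that is interrupted and restarted skips re-parsing the file.
const resumableSeeds = await parseSeeds(params, crawlState);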
1 change: 1 addition & 0 deletions src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "seedFile",
 ] as const;

 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
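LogContext is the union derived from this array via (typeof LOG_CONTEXT_TYPES)[number], so registering "seedFile" here is what allows the new parseseeds.ts module to pass it as a typed context. The call shape, as used in this PR (message, details object, context):

logger.debug("Pushed seed file seed to Redis", { url: scopedSeed.url }, "seedFile");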
134 changes: 134 additions & 0 deletions src/util/parseseeds.ts
@@ -0,0 +1,134 @@
+import fs from "fs";
+
+import { collectOnlineSeedFile } from "./file_reader.js";
+import { logger } from "./logger.js";
+import { type CrawlerArgs } from "./argParser.js";
+import { ScopedSeed, removeQuotes, type ScopeType } from "./seeds.js";
+import { type RedisCrawlState } from "./state.js";
+
+export async function parseSeeds(
+  params: CrawlerArgs,
+  crawlState?: RedisCrawlState,
+): Promise<ScopedSeed[]> {
+  let seeds = params.seeds as string[];
+  const scopedSeeds: ScopedSeed[] = [];
+
+  // Re-add seedFileDone from serialized state to Redis if present
+  if (params.state && params.state.seedFileDone && crawlState) {
+    await crawlState.markSeedFileDone();
+  }
+
+  let seedFileDone = false;
+  if (crawlState) {
+    seedFileDone = await crawlState.isSeedFileDone();
+  }
+
+  // Re-add any seeds from seed files from serialized state to Redis
+  if (
+    params.state &&
+    params.state.seedFileSeeds &&
+    seedFileDone &&
+    crawlState
+  ) {
+    for (const seedUrl of params.state.seedFileSeeds) {
+      await crawlState.addSeedFileSeed(seedUrl);
+    }
+  }
+
+  if (params.seedFile && !seedFileDone) {
+    let seedFilePath = params.seedFile as string;
+    if (
+      seedFilePath.startsWith("http://") ||
+      seedFilePath.startsWith("https://")
+    ) {
+      seedFilePath = await collectOnlineSeedFile(seedFilePath);
+    }
+
+    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
+    const urlSeedFileList = urlSeedFile.split("\n");
+
+    if (typeof seeds === "string") {
+      seeds = [seeds];
+    }
+
+    for (const seed of urlSeedFileList) {
+      if (seed) {
+        seeds.push(seed);
+      }
+    }
+  }
+
+  const scopeOpts = {
+    scopeType: params.scopeType as ScopeType | undefined,
+    sitemap: params.sitemap,
+    include: params.include,
+    exclude: params.exclude,
+    depth: params.depth,
+    extraHops: params.extraHops,
+  };
+
+  for (const seed of seeds) {
+    const newSeed = typeof seed === "string" ? { url: seed } : seed;
+    newSeed.url = removeQuotes(newSeed.url);
+
+    try {
+      const scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
+      scopedSeeds.push(scopedSeed);
+      if (params.seedFile && !seedFileDone && crawlState) {
+        await crawlState.addSeedFileSeed(scopedSeed.url);
+        logger.debug(
+          "Pushed seed file seed to Redis",
+          { url: scopedSeed.url },
+          "seedFile",
+        );
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    } catch (e: any) {
+      logger.error("Failed to create seed", {
+        error: e.toString(),
+        ...scopeOpts,
+        ...newSeed,
+      });
+      if (params.failOnFailedSeed) {
+        logger.fatal(
+          "Invalid seed specified, aborting crawl",
+          { url: newSeed.url },
+          "general",
+          1,
+        );
+      }
+    }
+  }
+
+  // If seed file was already successfully parsed, re-add seeds from Redis
+  if (params.seedFile && seedFileDone && crawlState) {
+    const seedFileSeedUrls = await crawlState.getSeedFileSeeds();
+    for (const seedUrl of seedFileSeedUrls) {
+      logger.debug(
+        "Pulled seed file seed from Redis",
+        { url: seedUrl },
+        "seedFile",
+      );
+      try {
+        const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seedUrl });
+        scopedSeeds.push(scopedSeed);
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      } catch (e: any) {
+        logger.error("Failed to create seed from Redis", {
+          error: e.toString(),
+          url: seedUrl,
+        });
+      }
+    }
+  }
+
+  if (!params.qaSource && !scopedSeeds.length) {
+    logger.fatal("No valid seeds specified, aborting crawl");
+  }
+
+  if (params.seedFile && crawlState) {
+    await crawlState.markSeedFileDone();
+  }
+
+  return scopedSeeds;
+}
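Because the file contents are split on "\n" and empty lines are skipped, a seed file is simply one URL per line; removeQuotes additionally strips surrounding quotes from each URL. A hypothetical seeds.txt (the file name and URLs are illustrative):

https://example.com/
https://example.org/section/

On the first run, each successfully scoped seed is mirrored into Redis via addSeedFileSeed and the file is marked done; on a restart, seedFileDone is already set, so the file (or remote seed-file URL) is never re-fetched and the same ordered list is replayed from Redis instead.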
73 changes: 1 addition & 72 deletions src/util/seeds.ts
@@ -1,11 +1,7 @@
-import fs from "fs";

 import { MAX_DEPTH } from "./constants.js";
-import { collectOnlineSeedFile } from "./file_reader.js";
-import { logger } from "./logger.js";
-import { type CrawlerArgs } from "./argParser.js";

-type ScopeType =
+export type ScopeType =
   | "prefix"
   | "host"
   | "domain"
@@ -304,73 +300,6 @@ export class ScopedSeed {
   }
 }

-export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
-  let seeds = params.seeds as string[];
-  const scopedSeeds: ScopedSeed[] = [];
-
-  if (params.seedFile) {
-    let seedFilePath = params.seedFile as string;
-    if (
-      seedFilePath.startsWith("http://") ||
-      seedFilePath.startsWith("https://")
-    ) {
-      seedFilePath = await collectOnlineSeedFile(seedFilePath);
-    }
-
-    const urlSeedFile = fs.readFileSync(seedFilePath, "utf8");
-    const urlSeedFileList = urlSeedFile.split("\n");
-
-    if (typeof seeds === "string") {
-      seeds = [seeds];
-    }
-
-    for (const seed of urlSeedFileList) {
-      if (seed) {
-        seeds.push(seed);
-      }
-    }
-  }
-
-  const scopeOpts = {
-    scopeType: params.scopeType as ScopeType | undefined,
-    sitemap: params.sitemap,
-    include: params.include,
-    exclude: params.exclude,
-    depth: params.depth,
-    extraHops: params.extraHops,
-  };
-
-  for (const seed of seeds) {
-    const newSeed = typeof seed === "string" ? { url: seed } : seed;
-    newSeed.url = removeQuotes(newSeed.url);
-
-    try {
-      scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    } catch (e: any) {
-      logger.error("Failed to create seed", {
-        error: e.toString(),
-        ...scopeOpts,
-        ...newSeed,
-      });
-      if (params.failOnFailedSeed) {
-        logger.fatal(
-          "Invalid seed specified, aborting crawl",
-          { url: newSeed.url },
-          "general",
-          1,
-        );
-      }
-    }
-  }
-
-  if (!params.qaSource && !scopedSeeds.length) {
-    logger.fatal("No valid seeds specified, aborting crawl");
-  }
-
-  return scopedSeeds;
-}
-
 export function rxEscape(string: string) {
   return string.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
 }
38 changes: 38 additions & 0 deletions src/util/state.ts
@@ -182,6 +182,8 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  seedFileDone?: boolean;
+  seedFileSeeds?: string[];
 };

 // ============================================================================
@@ -206,6 +208,10 @@ export class RedisCrawlState {

   sitemapDoneKey: string;

+  seedFileDoneKey: string;
+  seedFileSeedsKey: string;
+  seedFileSeedsMap: string;
+
   waczFilename: string | null = null;

   constructor(
@@ -241,6 +247,10 @@ export class RedisCrawlState {

     this.sitemapDoneKey = this.key + ":sitemapDone";

+    this.seedFileDoneKey = this.key + ":sfDone";
+    this.seedFileSeedsKey = this.key + ":sfSeeds";
+    this.seedFileSeedsMap = this.key + ":sfMap";
+
     this._initLuaCommands(this.redis);
   }

@@ -733,8 +743,10 @@ return inx;
     const pending = await this.getPendingList();
     const failed = await this._iterListKeys(this.fkey, seen);
     const errors = await this.getErrorList();
+    const seedFileSeeds = await this.getSeedFileSeeds();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const seedFileDone = await this.isSeedFileDone();

     const finished = [...seen.values()];

@@ -744,6 +756,8 @@ return inx;
       queued,
       pending,
       sitemapDone,
+      seedFileDone,
+      seedFileSeeds,
       failed,
       errors,
     };
Expand Down Expand Up @@ -888,6 +902,10 @@ return inx;
if (state.sitemapDone) {
await this.markSitemapDone();
}

if (state.seedFileDone) {
await this.markSeedFileDone();
}
}

// backwards compatibility: not using done, instead 'finished'
@@ -1032,6 +1050,14 @@ return inx;
     return await this.redis.lpush(this.pageskey, JSON.stringify(data));
   }

+  async addSeedFileSeed(url: string) {
+    const ret = await this.redis.sadd(this.seedFileSeedsMap, url);
+    if (ret > 0) {
+      // Push to end of list to keep seeds in order for ids
+      await this.redis.rpush(this.seedFileSeedsKey, url);
+    }
+  }
+
   // add extra seeds from redirect
   async addExtraSeed(
     seeds: ScopedSeed[],
@@ -1085,6 +1111,10 @@ return inx;
     return seeds[newSeedId];
   }

+  async getSeedFileSeeds() {
+    return await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
+  }
+
   async getExtraSeeds() {
     const seeds: ExtraRedirectSeed[] = [];
     const res = await this.redis.lrange(this.esKey, 0, -1);
@@ -1106,4 +1136,12 @@ return inx;
     result.modified = this._timestamp();
     await this.redis.set(`${this.key}:profileUploaded`, JSON.stringify(result));
   }
+
+  async isSeedFileDone() {
+    return (await this.redis.get(this.seedFileDoneKey)) == "1";
+  }
+
+  async markSeedFileDone() {
+    await this.redis.set(this.seedFileDoneKey, "1");
+  }
 }
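addSeedFileSeed relies on SADD returning 1 only when the member is new, and only then appends to the list, so the list stays duplicate-free while preserving first-insertion order (keeping seed ids stable across restarts). A sketch of the resulting semantics, assuming state is a connected RedisCrawlState:

await state.addSeedFileSeed("https://example.com/");
await state.addSeedFileSeed("https://example.org/");
await state.addSeedFileSeed("https://example.com/"); // SADD returns 0, so no RPUSH
console.log(await state.getSeedFileSeeds());
// -> [ "https://example.com/", "https://example.org/" ]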
2 changes: 1 addition & 1 deletion tests/scopes.test.js
@@ -1,5 +1,5 @@
 import { parseArgs } from "../dist/util/argParser.js";
-import { parseSeeds } from "../dist/util/seeds.js";
+import { parseSeeds } from "../dist/util/parseseeds.js";

 import fs from "fs";

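The test only needs its import updated, since the new signature is backward-compatible. A hypothetical spot-check of seed-file parsing (the fixture path and assertion are illustrative, not from the repo):

const params = parseArgs(["node", "crawler", "--seedFile", "tests/fixtures/urlSeedFile.txt"]);
const seeds = await parseSeeds(params);
// without a crawlState argument, the file is parsed fresh on each run
expect(seeds.length).toBeGreaterThan(0);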