Skip to content

Commit df395a7

Browse files
committed
Simplify to only storing seed file seed URLs and add tests
The only information in seed files is the URL, so there is no need to complicate things by storing anything more than that.
1 parent 17b6b23 commit df395a7

File tree

3 files changed

+229
-21
lines changed

3 files changed

+229
-21
lines changed

src/util/parseseeds.ts

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,8 @@ export async function parseSeeds(
3030
seedFileDone &&
3131
crawlState
3232
) {
33-
for (const seed of params.state.seedFileSeeds) {
34-
const scopedSeed: ScopedSeed = JSON.parse(seed);
35-
await crawlState.addSeedFileSeed(scopedSeed);
33+
for (const seedUrl of params.state.seedFileSeeds) {
34+
await crawlState.addSeedFileSeed(seedUrl);
3635
}
3736
}
3837

@@ -76,7 +75,7 @@ export async function parseSeeds(
7675
const scopedSeed = new ScopedSeed({ ...scopeOpts, ...newSeed });
7776
scopedSeeds.push(scopedSeed);
7877
if (params.seedFile && !seedFileDone && crawlState) {
79-
await crawlState.addSeedFileSeed(scopedSeed);
78+
await crawlState.addSeedFileSeed(scopedSeed.url);
8079
logger.debug(
8180
"Pushed seed file seed to Redis",
8281
{ url: scopedSeed.url },
@@ -103,21 +102,21 @@ export async function parseSeeds(
103102

104103
// If seed file was already successfully parsed, re-add seeds from Redis
105104
if (params.seedFile && seedFileDone && crawlState) {
106-
const seedFileScopedSeeds = await crawlState.getSeedFileSeeds();
107-
for (const seed of seedFileScopedSeeds) {
105+
const seedFileSeedUrls = await crawlState.getSeedFileSeeds();
106+
for (const seedUrl of seedFileSeedUrls) {
108107
logger.debug(
109108
"Pulled seed file seed from Redis",
110-
{ url: seed.url },
109+
{ url: seedUrl },
111110
"seedFile",
112111
);
113112
try {
114-
const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seed.url });
113+
const scopedSeed = new ScopedSeed({ ...scopeOpts, url: seedUrl });
115114
scopedSeeds.push(scopedSeed);
116115
// eslint-disable-next-line @typescript-eslint/no-explicit-any
117116
} catch (e: any) {
118117
logger.error("Failed to create seed from Redis", {
119118
error: e.toString(),
120-
...seed,
119+
url: seedUrl,
121120
});
122121
}
123122
}

src/util/state.ts

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,7 @@ return inx;
743743
const pending = await this.getPendingList();
744744
const failed = await this._iterListKeys(this.fkey, seen);
745745
const errors = await this.getErrorList();
746-
const seedFileSeeds = await this._iterListKeys(this.seedFileSeedsKey, seen);
746+
const seedFileSeeds = await this.getSeedFileSeeds();
747747
const extraSeeds = await this._iterListKeys(this.esKey, seen);
748748
const sitemapDone = await this.isSitemapDone();
749749
const seedFileDone = await this.isSeedFileDone();
@@ -1050,11 +1050,11 @@ return inx;
10501050
return await this.redis.lpush(this.pageskey, JSON.stringify(data));
10511051
}
10521052

1053-
async addSeedFileSeed(seed: ScopedSeed) {
1054-
const ret = await this.redis.sadd(this.seedFileSeedsMap, seed.url);
1053+
async addSeedFileSeed(url: string) {
1054+
const ret = await this.redis.sadd(this.seedFileSeedsMap, url);
10551055
if (ret > 0) {
10561056
// Push to end of list to keep seeds in order for ids
1057-
await this.redis.rpush(this.seedFileSeedsKey, JSON.stringify(seed));
1057+
await this.redis.rpush(this.seedFileSeedsKey, url);
10581058
}
10591059
}
10601060

@@ -1112,13 +1112,7 @@ return inx;
11121112
}
11131113

11141114
async getSeedFileSeeds() {
1115-
const seeds: ScopedSeed[] = [];
1116-
1117-
const res = await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
1118-
for (const key of res) {
1119-
seeds.push(JSON.parse(key));
1120-
}
1121-
return seeds;
1115+
return await this.redis.lrange(this.seedFileSeedsKey, 0, -1);
11221116
}
11231117

11241118
async getExtraSeeds() {

tests/url_file_list.test.js

Lines changed: 216 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,23 @@
11
import util from "util";
2-
import { spawn, exec as execCallback } from "child_process";
2+
import { spawn, execSync, exec as execCallback } from "child_process";
33
import fs from "fs";
4+
import path from "path";
5+
import yaml from "js-yaml";
6+
import Redis from "ioredis";
47

58
const exec = util.promisify(execCallback);
69

10+
const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
11+
const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";
12+
13+
const expectedSeedFileSeeds = [
14+
"https://old.webrecorder.net/about/",
15+
"https://specs.webrecorder.net/wacz/1.1.1/",
16+
"https://old.webrecorder.net/faq"
17+
];
18+
719
let proc = null;
20+
let redisId = null;
821

922
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
1023
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
@@ -20,6 +33,38 @@ afterAll(() => {
2033
});
2134

2235

36+
function sleep(ms) {
37+
return new Promise((resolve) => setTimeout(resolve, ms));
38+
}
39+
40+
async function waitContainerDone(containerId) {
41+
// containerId is initially the full id, but docker ps
42+
// only prints the short id (first 12 characters)
43+
containerId = containerId.slice(0, 12);
44+
45+
while (true) {
46+
try {
47+
const res = execSync("docker ps -q", { encoding: "utf-8" });
48+
if (res.indexOf(containerId) < 0) {
49+
return;
50+
}
51+
} catch (e) {
52+
console.error(e);
53+
}
54+
await sleep(500);
55+
}
56+
}
57+
58+
async function killContainer(containerId) {
59+
try {
60+
execSync(`docker kill -s SIGINT ${containerId}`);
61+
} catch (e) {
62+
return;
63+
}
64+
65+
await waitContainerDone(containerId);
66+
}
67+
2368

2469
test("check that URLs in seed-list are crawled", async () => {
2570
try {
@@ -91,3 +136,173 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
91136
}
92137
expect(foundSeedUrl).toBe(true);
93138
});
139+
140+
141+
let savedStateFile;
142+
let finished;
143+
144+
test("start crawl from seed list and then interrupt and save state when seeds have been crawled", async () => {
145+
let containerId = null;
146+
147+
try {
148+
containerId = execSync(
149+
`docker run -d -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
150+
{ encoding: "utf-8" },
151+
);
152+
} catch (error) {
153+
console.log(error);
154+
}
155+
156+
// remove existing pagesFile to support reentrancy
157+
try {
158+
fs.unlinkSync(pagesFile);
159+
} catch (e) {
160+
// ignore
161+
}
162+
163+
while (true) {
164+
try {
165+
const pages = fs
166+
.readFileSync(pagesFile, { encoding: "utf-8" })
167+
.trim()
168+
.split("\n");
169+
170+
if (pages.length >= 4) {
171+
break;
172+
}
173+
} catch (e) {
174+
// ignore
175+
}
176+
177+
await sleep(500);
178+
}
179+
180+
await killContainer(containerId);
181+
182+
const savedStates = fs.readdirSync(
183+
"test-crawls/collections/seed-file-restart-test/crawls",
184+
);
185+
expect(savedStates.length > 0).toEqual(true);
186+
187+
savedStateFile = savedStates[savedStates.length - 1];
188+
});
189+
190+
191+
test("check saved state for seed file seeds", () => {
192+
expect(savedStateFile).toBeTruthy();
193+
194+
const savedState = fs.readFileSync(
195+
path.join("test-crawls/collections/seed-file-restart-test/crawls", savedStateFile),
196+
"utf-8",
197+
);
198+
199+
const saved = yaml.load(savedState);
200+
201+
const state = saved.state;
202+
finished = state.finished;
203+
204+
const numDone = finished.length;
205+
const numQueued = state.queued.length;
206+
207+
expect(!!state).toBe(true);
208+
expect(numDone > 0).toEqual(true);
209+
expect(numQueued > 0).toEqual(true);
210+
211+
const seedFileDone = state.seedFileDone;
212+
expect(seedFileDone).toEqual(true);
213+
214+
const seedFileSeeds = state.seedFileSeeds;
215+
expect(seedFileSeeds.length).toEqual(3);
216+
for (const [index, seed] of seedFileSeeds.entries()) {
217+
expect(seed).toEqual(expectedSeedFileSeeds[index]);
218+
}
219+
});
220+
221+
222+
test("check seed file seed crawl finishes successfully after resuming from saved state", async () => {
223+
let containerId = null;
224+
225+
const port = 36383;
226+
227+
try {
228+
containerId = execSync(
229+
`docker run -d -p ${port}:6379 -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --debugAccessRedis --config /crawls/collections/seed-file-restart-test/crawls/${savedStateFile} --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
230+
{ encoding: "utf-8" },
231+
);
232+
} catch (error) {
233+
console.log(error);
234+
}
235+
236+
await sleep(2000);
237+
238+
const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null });
239+
240+
try {
241+
await redis.connect({
242+
maxRetriesPerRequest: 100,
243+
});
244+
245+
await sleep(2000);
246+
247+
for (const url of finished) {
248+
const res = await redis.sismember("seedfiletest:s", url);
249+
expect(res).toBe(1);
250+
}
251+
} catch (e) {
252+
console.log(e);
253+
} finally {
254+
await waitContainerDone(containerId);
255+
}
256+
});
257+
258+
test("ensure all pages were crawled", async () => {
259+
const pages = fs
260+
.readFileSync(pagesFile, { encoding: "utf-8" })
261+
.trim()
262+
.split("\n");
263+
264+
// first line is the header
265+
expect(pages.length).toBe(4);
266+
267+
const extraPages = fs
268+
.readFileSync(extraPagesFile, { encoding: "utf-8" })
269+
.trim()
270+
.split("\n");
271+
272+
// first line is the header
273+
expect(extraPages.length).toBe(8);
274+
})
275+
276+
277+
test("ensure that seed file seeds were pulled from Redis on restart", async () => {
278+
const logDir = "test-crawls/collections/seed-file-restart-test/logs/";
279+
const logFiles = [];
280+
fs.readdirSync(logDir).forEach((file) => {
281+
if (file.endsWith(".log")) {
282+
logFiles.push(path.join(logDir, file));
283+
}
284+
});
285+
286+
expect(logFiles.length).toBeGreaterThan(0);
287+
288+
const logFile = logFiles[logFiles.length - 1];
289+
const log = fs.readFileSync(logFile, { encoding: "utf-8" }).trim();
290+
291+
expect(
292+
log.indexOf(
293+
'"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',
294+
) > 0,
295+
).toBe(true);
296+
297+
expect(
298+
log.indexOf(
299+
'"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}',
300+
) > 0,
301+
).toBe(true);
302+
303+
expect(
304+
log.indexOf(
305+
'"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq"}',
306+
) > 0,
307+
).toBe(true);
308+
});

0 commit comments

Comments (0)