@@ -1,10 +1,23 @@
 import util from "util";
-import { spawn, exec as execCallback } from "child_process";
+import { spawn, execSync, exec as execCallback } from "child_process";
 import fs from "fs";
+import path from "path";
+import yaml from "js-yaml";
+import Redis from "ioredis";
 
 const exec = util.promisify(execCallback);
 
+const pagesFile = "test-crawls/collections/seed-file-restart-test/pages/pages.jsonl";
+const extraPagesFile = "test-crawls/collections/seed-file-restart-test/pages/extraPages.jsonl";
+
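+// seed URLs expected from the fetched seed file, in order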
+const expectedSeedFileSeeds = [
+  "https://old.webrecorder.net/about/",
+  "https://specs.webrecorder.net/wacz/1.1.1/",
+  "https://old.webrecorder.net/faq"
+];
+
 let proc = null;
+let redisId = null;
 
 const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
 const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
@@ -20,6 +33,38 @@ afterAll(() => {
 });
 
 
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
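+// poll `docker ps` until the container id no longer appears, i.e. it has exited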
+async function waitContainerDone(containerId) {
+  // containerId is initially the full id, but docker ps
+  // only prints the short id (first 12 characters)
+  containerId = containerId.slice(0, 12);
+
+  while (true) {
+    try {
+      const res = execSync("docker ps -q", { encoding: "utf-8" });
+      if (res.indexOf(containerId) < 0) {
+        return;
+      }
+    } catch (e) {
+      console.error(e);
+    }
+    await sleep(500);
+  }
+}
+
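+// send SIGINT so the crawler can shut down gracefully, then wait for the container to exit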
+async function killContainer(containerId) {
+  try {
+    execSync(`docker kill -s SIGINT ${containerId}`);
+  } catch (e) {
+    return;
+  }
+
+  await waitContainerDone(containerId);
+}
+
 
 test("check that URLs in seed-list are crawled", async () => {
   try {
@@ -91,3 +136,173 @@ test("check that URLs in seed-list hosted at URL are crawled", async () => {
   }
   expect(foundSeedUrl).toBe(true);
 });
+
+
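+// shared between the tests below: set by the interrupt test, checked after restart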
+let savedStateFile;
+let finished;
+
+test("start crawl from seed list and then interrupt and save state when seeds have been crawled", async () => {
+  let containerId = null;
+
+  try {
+    containerId = execSync(
+      `docker run -d -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
+      { encoding: "utf-8" },
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
+  // remove existing pagesFile to support reentrancy
+  try {
+    fs.unlinkSync(pagesFile);
+  } catch (e) {
+    // ignore
+  }
+
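+  // wait until pages.jsonl contains the header line plus at least three pages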
+  while (true) {
+    try {
+      const pages = fs
+        .readFileSync(pagesFile, { encoding: "utf-8" })
+        .trim()
+        .split("\n");
+
+      if (pages.length >= 4) {
+        break;
+      }
+    } catch (e) {
+      // ignore
+    }
+
+    await sleep(500);
+  }
+
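+  // interrupt via SIGINT so the crawler writes its saved state before exiting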
+  await killContainer(containerId);
+
+  const savedStates = fs.readdirSync(
+    "test-crawls/collections/seed-file-restart-test/crawls",
+  );
+  expect(savedStates.length > 0).toEqual(true);
+
+  savedStateFile = savedStates[savedStates.length - 1];
+});
+
+
+test("check saved state for seed file seeds", () => {
+  expect(savedStateFile).toBeTruthy();
+
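+  // the saved state is a YAML config in the collection's crawls/ directory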
+  const savedState = fs.readFileSync(
+    path.join("test-crawls/collections/seed-file-restart-test/crawls", savedStateFile),
+    "utf-8",
+  );
+
+  const saved = yaml.load(savedState);
+
+  const state = saved.state;
+  finished = state.finished;
+
+  const numDone = finished.length;
+  const numQueued = state.queued.length;
+
+  expect(!!state).toBe(true);
+  expect(numDone > 0).toEqual(true);
+  expect(numQueued > 0).toEqual(true);
+
+  const seedFileDone = state.seedFileDone;
+  expect(seedFileDone).toEqual(true);
+
+  const seedFileSeeds = state.seedFileSeeds;
+  expect(seedFileSeeds.length).toEqual(3);
+  for (const [index, seed] of seedFileSeeds.entries()) {
+    expect(seed).toEqual(expectedSeedFileSeeds[index]);
+  }
+});
+
+
+test("check seed file seed crawl finishes successfully after resuming from saved state", async () => {
+  let containerId = null;
+
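+  // host port mapped to the container's Redis (6379), exposed via --debugAccessRedis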
+  const port = 36383;
+
+  try {
+    containerId = execSync(
+      `docker run -d -p ${port}:6379 -e CRAWL_ID=seedfiletest -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection seed-file-restart-test --debugAccessRedis --config /crawls/collections/seed-file-restart-test/crawls/${savedStateFile} --seedFile "${TEST_HOST}/urlSeedFile.txt" --limit 10 --behaviors "" --exclude community --scopeType page --extraHops 1 --logging stats,debug`,
+      { encoding: "utf-8" },
+    );
+  } catch (error) {
+    console.log(error);
+  }
+
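+  // allow time for the container's Redis to come up before connecting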
+  await sleep(2000);
+
+  // note: ioredis connect() takes no options, so maxRetriesPerRequest is
+  // passed to the client constructor rather than to connect()
+  const redis = new Redis(`redis://127.0.0.1:${port}/0`, {
+    lazyConnect: true,
+    retryStrategy: () => null,
+    maxRetriesPerRequest: 100,
+  });
+
+  try {
+    await redis.connect();
+
+    await sleep(2000);
+
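+    // each URL finished before the interrupt should be present in the crawl's Redis set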
+    for (const url of finished) {
+      const res = await redis.sismember("seedfiletest:s", url);
+      expect(res).toBe(1);
+    }
+  } catch (e) {
+    console.log(e);
+  } finally {
+    await waitContainerDone(containerId);
+  }
+});
+
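+// pages.jsonl should hold a header + 3 seeds; extraPages.jsonl a header + 7 extra-hop pages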
+test("ensure all pages were crawled", async () => {
+  const pages = fs
+    .readFileSync(pagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(pages.length).toBe(4);
+
+  const extraPages = fs
+    .readFileSync(extraPagesFile, { encoding: "utf-8" })
+    .trim()
+    .split("\n");
+
+  // first line is the header
+  expect(extraPages.length).toBe(8);
+});
+
+
+test("ensure that seed file seeds were pulled from Redis on restart", async () => {
+  const logDir = "test-crawls/collections/seed-file-restart-test/logs/";
+  const logFiles = [];
+  fs.readdirSync(logDir).forEach((file) => {
+    if (file.endsWith(".log")) {
+      logFiles.push(path.join(logDir, file));
+    }
+  });
+
+  expect(logFiles.length).toBeGreaterThan(0);
+
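+  // check the most recent log file, which belongs to the restarted crawl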
+  const logFile = logFiles[logFiles.length - 1];
+  const log = fs.readFileSync(logFile, { encoding: "utf-8" }).trim();
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/about/"}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}',
+    ) > 0,
+  ).toBe(true);
+
+  expect(
+    log.indexOf(
+      '"logLevel":"debug","context":"seedFile","message":"Pulled seed file seed from Redis","details":{"url":"https://old.webrecorder.net/faq"}',
+    ) > 0,
+  ).toBe(true);
+});