
Commit 9ddca55

Don't delete seed files if referenced by crawls (#2816)
Fixes #2812
1 parent 2be553d commit 9ddca55

File tree: 4 files changed, +100 -13 lines changed

backend/btrixcloud/file_uploads.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -52,6 +52,7 @@ class FileUploadOps:
     def __init__(self, mdb, org_ops, storage_ops):
         self.files = mdb["file_uploads"]
         self.crawl_configs = mdb["crawl_configs"]
+        self.crawls = mdb["crawls"]
 
         self.org_ops = org_ops
         self.storage_ops = storage_ops
@@ -325,6 +326,10 @@ async def delete_seed_file(
         if matching_workflow:
             raise HTTPException(status_code=400, detail="seed_file_in_use")
 
+        matching_crawl = await self.crawls.find_one({"config.seedFileId": file_id})
+        if matching_crawl:
+            raise HTTPException(status_code=400, detail="seed_file_in_use")
+
         await self.storage_ops.delete_file_object(org, file)
         await self.files.delete_one({"_id": file_id, "oid": org.id})
         if file.type == "seedFile":
```
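
For context, the guard can be read as a small helper. This is a minimal sketch assuming `mdb` is a motor (async MongoDB) database handle, as in `__init__` above; `assert_seed_file_unused` is a hypothetical name, while the collection and field names come straight from the diff:

```python
from fastapi import HTTPException

async def assert_seed_file_unused(mdb, file_id):
    """Hypothetical helper: raise 400 if any workflow or crawl still
    references the seed file."""
    # Pre-existing check: a workflow references the file via config.seedFileId
    if await mdb["crawl_configs"].find_one({"config.seedFileId": file_id}):
        raise HTTPException(status_code=400, detail="seed_file_in_use")
    # New check in this commit: crawls keep a copy of the workflow config,
    # so a crawl can still need the seed file after the workflow drops it
    if await mdb["crawls"].find_one({"config.seedFileId": file_id}):
        raise HTTPException(status_code=400, detail="seed_file_in_use")
```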
```diff
@@ -368,6 +373,12 @@ async def cleanup_unused_seed_files(self):
             if first_matching_workflow:
                 continue
 
+            first_matching_crawl = await self.crawls.find_one(
+                {"config.seedFileId": file_id}
+            )
+            if first_matching_crawl:
+                continue
+
             try:
                 org = await self.org_ops.get_org_by_id(file_dict["oid"])
                 await self.delete_seed_file(file_id, org)
```
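
The hunk only shows the inside of the loop; a rough reconstruction of the whole sweep follows, where the `find()` filter and the `except` handling are assumptions and the attribute and helper names come from the diff context:

```python
from fastapi import HTTPException

async def cleanup_unused_seed_files(self):
    """Periodically delete seed files no longer referenced anywhere."""
    # Assumed iteration: all stored seed files (the filter is a guess)
    async for file_dict in self.files.find({"type": "seedFile"}):
        file_id = file_dict["_id"]
        # Keep the file if a workflow still references it (pre-existing)
        if await self.crawl_configs.find_one({"config.seedFileId": file_id}):
            continue
        # Keep the file if a crawl still references it (added by this commit)
        if await self.crawls.find_one({"config.seedFileId": file_id}):
            continue
        try:
            org = await self.org_ops.get_org_by_id(file_dict["oid"])
            await self.delete_seed_file(file_id, org)
        except HTTPException:
            # delete_seed_file can raise 400 if a reference appeared in the
            # meantime; skip and retry on the next sweep (assumed handling)
            continue
```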

backend/test/test_crawlconfigs.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -13,8 +13,6 @@
 _coll_id = None
 _admin_crawl_cid = None
 
-_seed_file_id = None
-
 
 def test_crawl_config_usernames(
     crawler_auth_headers, default_org_id, crawler_config_id
@@ -978,7 +976,7 @@ def test_add_crawl_config_with_seed_file(
     assert data["config"]["seeds"] is None
 
 
-def test_delete_in_use_seed_file(
+def test_delete_seed_file_in_use_crawlconfig(
    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
 ):
     # Attempt to delete in-use seed file, verify we get 400 response
```

backend/test/test_run_crawl.py

Lines changed: 84 additions & 4 deletions
```diff
@@ -27,6 +27,8 @@
 # (not using the fixture to be able to test running crawl)
 admin_crawl_id = None
 
+seed_file_crawl_id = None
+
 
 def test_list_orgs(admin_auth_headers, default_org_id):
     r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
@@ -1377,12 +1379,14 @@ def test_seed_file_crawl(
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    crawl_id = r.json()["started"]
+
+    global seed_file_crawl_id
+    seed_file_crawl_id = r.json()["started"]
 
     # Wait for it to complete
     while True:
         r = requests.get(
-            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{seed_file_crawl_id}/replay.json",
             headers=crawler_auth_headers,
         )
         data = r.json()
@@ -1394,7 +1398,7 @@ def test_seed_file_crawl(
 
     # Check on crawl
     r = requests.get(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{seed_file_crawl_id}/replay.json",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
@@ -1405,7 +1409,7 @@ def test_seed_file_crawl(
 
     # Validate crawl pages
     r = requests.get(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/pages",
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{seed_file_crawl_id}/pages",
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
@@ -1416,3 +1420,79 @@ def test_seed_file_crawl(
         "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     )
+
+
+def test_delete_seed_file_in_use_crawl(
+    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
+):
+    # Remove seed file from workflow
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": [{"url": "https://webrecorder.net"}],
+                "scopeType": "page",
+                "limit": 1,
+                "seedFileId": None,
+            }
+        },
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["metadata_changed"] == False
+    assert data["settings_changed"] == True
+
+    # Verify seed file was removed
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{seed_file_config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["config"]["seedFileId"] is None
+
+    # Attempt to delete seed file, ensure we get 400 response
+    r = requests.delete(
+        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "seed_file_in_use"
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["id"] == seed_file_id
+
+
+def test_delete_seed_file_not_in_use(
+    crawler_auth_headers, default_org_id, seed_file_id, seed_file_config_id
+):
+    # Delete crawl with seed file id so it's no longer in use
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=crawler_auth_headers,
+        json={"crawl_ids": [seed_file_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"] == 1
+
+    # Delete seed file
+    r = requests.delete(
+        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["success"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/files/{seed_file_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
```
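
The two new tests pin down the behavior a client sees. Here is a hypothetical client-side sketch against the same endpoints; the function name and parameters are placeholders, while the URL shape and the `seed_file_in_use` detail string come from the tests above:

```python
import requests

def try_delete_seed_file(api_prefix, org_id, file_id, headers):
    """Attempt to delete a seed file; returns False if it is still in use."""
    r = requests.delete(
        f"{api_prefix}/orgs/{org_id}/files/{file_id}", headers=headers
    )
    if r.status_code == 400 and r.json().get("detail") == "seed_file_in_use":
        # A workflow or a crawl still references the file; nothing was deleted
        return False
    r.raise_for_status()
    return bool(r.json().get("success"))
```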

backend/test/test_uploads.py

Lines changed: 4 additions & 6 deletions

Expected counts drop by one because the "Seed File Test Crawl" is now deleted by test_delete_seed_file_not_in_use in test_run_crawl.py above.
```diff
@@ -607,7 +607,7 @@ def test_get_all_crawls_by_type(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 7
+    assert data["total"] == 6
     for item in data["items"]:
         assert item["type"] == "crawl"
 
@@ -639,7 +639,7 @@ def test_get_all_crawls_by_user(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 6
+    assert data["total"] == 5
     for item in data["items"]:
         assert item["userid"] == crawler_userid
 
@@ -823,15 +823,14 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()
 
-    assert len(data["names"]) == 9
+    assert len(data["names"]) == 8
     expected_names = [
         "Crawler User Test Crawl",
         "Custom Behavior Logs",
         "My Upload Updated",
         "test2.wacz",
         "All Crawls Test Crawl",
         "Crawler User Crawl for Testing QA",
-        "Seed File Test Crawl",
     ]
     for expected_name in expected_names:
         assert expected_name in data["names"]
@@ -851,14 +850,13 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()
 
-    assert len(data["names"]) == 6
+    assert len(data["names"]) == 5
     expected_names = [
         "Admin Test Crawl",
         "All Crawls Test Crawl",
         "Crawler User Crawl for Testing QA",
         "Crawler User Test Crawl",
         "Custom Behavior Logs",
-        "Seed File Test Crawl",
     ]
     for expected_name in expected_names:
         assert expected_name in data["names"]
```
