
Commit 1fd0141

tw4li and ikreymer authored
Add backend validation to prevent adding failed crawls to collection (#2930)
Fixes #2915

- Check, before any collection updates, that the crawls being added exist and finished successfully.
- Ensure the collection exists before adding crawls to it.

Adds tests for creating and updating collections with a failed item.

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
1 parent aa03965 · commit 1fd0141

7 files changed: +85 −19 lines changed


backend/btrixcloud/basecrawls.py

Lines changed: 19 additions & 0 deletions
@@ -613,6 +613,25 @@ async def bulk_presigned_files(
 
         return resources, pages_optimized
 
+    async def validate_all_crawls_successful(
+        self, crawl_ids: List[str], org: Organization
+    ):
+        """Validate that crawls in list exist and have a successful state, or throw"""
+        # convert to set to remove any duplicates
+        crawl_id_set = set(crawl_ids)
+
+        count = await self.crawls.count_documents(
+            {
+                "_id": {"$in": list(crawl_id_set)},
+                "oid": org.id,
+                "state": {"$in": SUCCESSFUL_STATES},
+            }
+        )
+        if count != len(crawl_id_set):
+            raise HTTPException(
+                status_code=400, detail="invalid_failed_or_unfinished_crawl"
+            )
+
     async def add_to_collection(
         self, crawl_ids: List[str], collection_id: UUID, org: Organization
     ):
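One detail worth noting in validate_all_crawls_successful: the incoming IDs are deduplicated before counting, because the check compares the number of matching documents against the number of requested IDs. A tiny self-contained illustration (hypothetical IDs, no database needed) of why the set conversion matters:

    # The same successful crawl requested twice (hypothetical IDs).
    crawl_ids = ["crawl-abc", "crawl-abc"]

    # count_documents matches each stored crawl at most once,
    # so it would report 1 match here.
    matching = 1

    assert matching != len(crawl_ids)       # naive length check: false rejection
    assert matching == len(set(crawl_ids))  # deduplicated check: passes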

backend/btrixcloud/colls.py

Lines changed: 8 additions & 5 deletions
@@ -120,17 +120,19 @@ async def init_index(self):
             [("oid", pymongo.ASCENDING), ("description", pymongo.ASCENDING)]
         )
 
-    async def add_collection(self, oid: UUID, coll_in: CollIn):
+    async def add_collection(self, org: Organization, coll_in: CollIn):
         """Add new collection"""
         crawl_ids = coll_in.crawlIds if coll_in.crawlIds else []
+        await self.crawl_ops.validate_all_crawls_successful(crawl_ids, org)
+
         coll_id = uuid4()
         created = dt_now()
 
         slug = coll_in.slug or slug_from_name(coll_in.name)
 
         coll = Collection(
             id=coll_id,
-            oid=oid,
+            oid=org.id,
             name=coll_in.name,
             slug=slug,
             description=coll_in.description,
@@ -143,7 +145,6 @@ async def add_collection(self, oid: UUID, coll_in: CollIn):
         )
         try:
             await self.collections.insert_one(coll.to_dict())
-            org = await self.orgs.get_org_by_id(oid)
             await self.clear_org_previous_slugs_matching_slug(slug, org)
 
             if crawl_ids:
@@ -228,7 +229,7 @@ async def add_crawls_to_collection(
         headers: Optional[dict] = None,
     ) -> CollOut:
         """Add crawls to collection"""
-        await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
+        await self.crawl_ops.validate_all_crawls_successful(crawl_ids, org)
 
         modified = dt_now()
         result = await self.collections.find_one_and_update(
@@ -239,6 +240,8 @@ async def add_crawls_to_collection(
         if not result:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
+        await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
+
         await self.update_collection_counts_and_tags(coll_id)
         await self.update_collection_dates(coll_id, org.id)
 
@@ -1018,7 +1021,7 @@ def init_collections_api(
     async def add_collection(
         new_coll: CollIn, org: Organization = Depends(org_crawl_dep)
     ):
-        return await colls.add_collection(org.id, new_coll)
+        return await colls.add_collection(org, new_coll)
 
     @app.get(
         "/orgs/{oid}/collections",

backend/test/test_collections.py

Lines changed: 27 additions & 0 deletions
@@ -1762,6 +1762,33 @@ def test_get_public_collection_slug_redirect(admin_auth_headers, default_org_id)
     assert r.status_code == 404
 
 
+def test_create_collection_with_failed_crawl(
+    admin_auth_headers, default_org_id, canceled_crawl_id
+):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections",
+        headers=admin_auth_headers,
+        json={
+            "crawlIds": [canceled_crawl_id],
+            "name": "Should get rejected",
+        },
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_failed_or_unfinished_crawl"
+
+
+def test_add_failed_crawl_to_collection(
+    admin_auth_headers, default_org_id, canceled_crawl_id
+):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_second_coll_id}/add",
+        json={"crawlIds": [canceled_crawl_id]},
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_failed_or_unfinished_crawl"
+
+
 def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id):
     # Delete second collection
     r = requests.delete(
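Both new tests lean on a canceled_crawl_id fixture that this diff does not include (it lives in the suite's conftest). A rough sketch of what such a fixture plausibly does; the endpoints and payload here are assumptions modeled on the rest of the test suite, not part of this commit:

    import pytest
    import requests

    @pytest.fixture(scope="session")
    def canceled_crawl_id(admin_auth_headers, default_org_id):
        # Assumed flow: create a workflow that starts crawling immediately...
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
            headers=admin_auth_headers,
            json={
                "runNow": True,
                "name": "Canceled crawl",
                "tags": ["canceled"],
                "config": {"seeds": [{"url": "https://example.com/"}]},
            },
        )
        crawl_id = r.json()["run_now_job"]

        # ...then cancel it so the crawl ends in a non-successful state.
        requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
            headers=admin_auth_headers,
        )
        return crawl_id

The same fixture also explains the remaining diffs below: a workflow named "Canceled crawl" with a "canceled" tag now exists in the test org, so the search-value, tag, count, and sort assertions in the other test files each shift by one.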

backend/test/test_crawl_config_search_values.py

Lines changed: 15 additions & 3 deletions
@@ -44,7 +44,7 @@ def test_get_search_values_1(admin_auth_headers, default_org_id):
     )
     data = r.json()
     assert sorted(data["names"]) == sorted(
-        [NAME_1, "Admin Test Crawl", "crawler User Test Crawl"]
+        [NAME_1, "Admin Test Crawl", "Canceled crawl", "crawler User Test Crawl"]
     )
     assert sorted(data["descriptions"]) == sorted(
         ["Admin Test Crawl description", "crawler test crawl", DESCRIPTION_1]
@@ -74,7 +74,13 @@ def test_get_search_values_2(admin_auth_headers, default_org_id):
     )
     data = r.json()
     assert sorted(data["names"]) == sorted(
-        [NAME_1, NAME_2, "Admin Test Crawl", "crawler User Test Crawl"]
+        [
+            NAME_1,
+            NAME_2,
+            "Admin Test Crawl",
+            "Canceled crawl",
+            "crawler User Test Crawl",
+        ]
     )
     assert sorted(data["descriptions"]) == sorted(
         [
@@ -111,7 +117,13 @@ def test_get_search_values_3(admin_auth_headers, default_org_id):
     )
     data = r.json()
     assert sorted(data["names"]) == sorted(
-        [NAME_1, NAME_2, "Admin Test Crawl", "crawler User Test Crawl"]
+        [
+            NAME_1,
+            NAME_2,
+            "Admin Test Crawl",
+            "Canceled crawl",
+            "crawler User Test Crawl",
+        ]
     )
     assert sorted(data["descriptions"]) == sorted(
         [

backend/test/test_crawl_config_tags.py

Lines changed: 4 additions & 1 deletion
@@ -47,7 +47,7 @@ def test_get_config_by_tag_1(admin_auth_headers, default_org_id):
         headers=admin_auth_headers,
     )
     data = r.json()
-    assert sorted(data) == ["tag-1", "tag-2", "wr-test-1", "wr-test-2"]
+    assert sorted(data) == ["canceled", "tag-1", "tag-2", "wr-test-1", "wr-test-2"]
 
 
 def test_get_config_by_tag_counts_1(admin_auth_headers, default_org_id):
@@ -59,6 +59,7 @@ def test_get_config_by_tag_counts_1(admin_auth_headers, default_org_id):
     assert data == {
         "tags": [
             {"tag": "wr-test-2", "count": 2},
+            {"tag": "canceled", "count": 1},
             {"tag": "tag-1", "count": 1},
             {"tag": "tag-2", "count": 1},
             {"tag": "wr-test-1", "count": 1},
@@ -91,6 +92,7 @@ def test_get_config_by_tag_2(admin_auth_headers, default_org_id):
     )
     data = r.json()
     assert sorted(data) == [
+        "canceled",
         "tag-0",
         "tag-1",
         "tag-2",
@@ -109,6+111,7 @@ def test_get_config_by_tag_counts_2(admin_auth_headers, default_org_id):
     assert data == {
         "tags": [
             {"tag": "wr-test-2", "count": 2},
+            {"tag": "canceled", "count": 1},
             {"tag": "tag-0", "count": 1},
             {"tag": "tag-1", "count": 1},
             {"tag": "tag-2", "count": 1},

backend/test/test_filter_sort_results.py

Lines changed: 6 additions & 6 deletions
@@ -102,8 +102,8 @@ def test_ensure_crawl_and_admin_user_crawls(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls",
         headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 2
-    assert r.json()["total"] == 2
+    assert len(r.json()["items"]) == 3
+    assert r.json()["total"] == 3
 
 
 def test_get_crawl_job_by_user(
@@ -212,9 +212,9 @@ def test_sort_crawls(
         headers=crawler_auth_headers,
     )
     data = r.json()
-    assert data["total"] == 2
+    assert data["total"] == 3
     items = data["items"]
-    assert len(items) == 2
+    assert len(items) == 3
 
     last_created = None
     for crawl in items:
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
         headers=crawler_auth_headers,
    )
     data = r.json()
-    assert data["total"] == 17
+    assert data["total"] == 18
     items = data["items"]
-    assert len(items) == 17
+    assert len(items) == 18
 
     last_created = None
     for config in items:

backend/test/test_uploads.py

Lines changed: 6 additions & 4 deletions
@@ -592,7 +592,7 @@ def test_get_all_crawls_by_first_seed(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 5
+    assert data["total"] == 6
     for item in data["items"]:
         assert item["firstSeed"] == first_seed
 
@@ -607,7 +607,7 @@ def test_get_all_crawls_by_type(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 6
+    assert data["total"] == 7
     for item in data["items"]:
         assert item["type"] == "crawl"
 
@@ -823,9 +823,10 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()
 
-    assert len(data["names"]) == 8
+    assert len(data["names"]) == 9
     expected_names = [
         "crawler User Test Crawl",
+        "Canceled crawl",
         "Custom Behavior Logs",
         "My Upload Updated",
         "test2.wacz",
@@ -849,10 +850,11 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()
 
-    assert len(data["names"]) == 5
+    assert len(data["names"]) == 6
     expected_names = [
         "Admin Test Crawl",
         "All Crawls Test Crawl",
+        "Canceled crawl",
         "Crawler User Crawl for Testing QA",
         "crawler User Test Crawl",
         "Custom Behavior Logs",

0 commit comments
