
Commit 8726851

profile constraint follow-up (#3009)

- follow-up to #2988, alternative to #3008
- clear proxyId in workflow if using profile; ensure it is always empty for new or updated workflows that have a profile
- add migration to clear proxyId if profileid is set for existing workflows
- avoids having to update proxyId in workflows when it changes in the profile
- fix assertion when updating proxy to return 400 if proxy is invalid
- fix switching from one profile to another; add test to ensure it works as expected
- ci: add CI action to clear out disk space to reduce chance of failures
1 parent 1750557 commit 8726851

File tree

9 files changed: +144, −25 lines changed

.github/workflows/k3d-ci.yaml

Lines changed: 7 additions & 0 deletions
```diff
@@ -45,6 +45,13 @@ jobs:
     needs: paths-filter
     if: needs.paths-filter.outputs.matches == 'true'
     steps:
+      - name: Initial Disk Cleanup
+        uses: mathio/gha-cleanup@v1
+        with:
+          remove-browsers: true
+          verbose: true
+
+
       - name: Create k3d Cluster
         uses: AbsaOSS/k3d-action@v2
         with:
```

backend/btrixcloud/basecrawls.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -464,9 +464,11 @@ async def _resolve_crawl_refs(
             raise HTTPException(status_code=400, detail="missing_org")

         if hasattr(crawl, "profileid") and crawl.profileid:
-            crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
+            profile = await self.crawl_configs.profiles.get_profile(
                 crawl.profileid, org
             )
+            if profile:
+                crawl.profileName = profile.name

         if (
             files
```
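With `get_profile` replacing `get_profile_name`, the lookup can now return `None` for a missing profile, so the name is only copied when a profile actually comes back. A minimal sketch of the guard using stand-in types (`FakeProfile` and `FakeCrawl` are illustrative, not from the codebase):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeProfile:  # stand-in for btrixcloud.models.Profile
    name: str


@dataclass
class FakeCrawl:  # stand-in for a crawl record with profile fields
    profileid: Optional[str] = None
    profileName: Optional[str] = None


def resolve_profile_name(crawl: FakeCrawl, profile: Optional[FakeProfile]) -> None:
    # only set profileName when the lookup returned a result; a deleted or
    # missing profile leaves the name untouched instead of raising
    if profile:
        crawl.profileName = profile.name


crawl = FakeCrawl(profileid="abc")
resolve_profile_name(crawl, None)  # missing profile: no AttributeError
assert crawl.profileName is None
resolve_profile_name(crawl, FakeProfile(name="logged-in"))
assert crawl.profileName == "logged-in"
```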

backend/btrixcloud/crawlconfigs.py

Lines changed: 32 additions & 23 deletions
```diff
@@ -265,23 +265,18 @@ async def add_crawl_config(
         proxy_id = config_in.proxyId

         profileid = None
+        # ensure profile is valid, get proxy_id from profile
         if isinstance(config_in.profileid, UUID):
             profileid = config_in.profileid
-
-        # ensure profile is valid, get proxy_id from profile
-        if profileid:
-            profile = await self.profiles.get_profile(profileid, org)
-            proxy_id = profile.proxyId
+            proxy_id = None
         else:
             if config_in.config and config_in.config.failOnContentCheck:
                 raise HTTPException(
                     status_code=400, detail="fail_on_content_check_requires_profile"
                 )

         # ensure proxy_id is valid and available for org
-        if proxy_id:
-            if not self.can_org_use_proxy(org, proxy_id):
-                raise HTTPException(status_code=404, detail="proxy_not_found")
+        self.assert_can_org_use_proxy(org, proxy_id)

         if config_in.config.exclude:
             exclude = config_in.config.exclude
```
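Net effect of this hunk: a new workflow with a profile always stores `proxyId = None` (the profile's proxy wins at crawl time), while a workflow without a profile has its proxy validated up front. A self-contained sketch of that branching, under assumed names (`ALLOWED_PROXIES` and `resolve_new_workflow_proxy` are illustrative):

```python
from typing import Optional
from uuid import UUID, uuid4

ALLOWED_PROXIES = {"proxy-us-east"}  # illustrative org allow-list


def resolve_new_workflow_proxy(
    profileid: Optional[UUID], proxy_id: Optional[str]
) -> Optional[str]:
    if isinstance(profileid, UUID):
        # profile set: the workflow's own proxy is dropped; the
        # profile's proxy will be used at crawl time instead
        return None
    if proxy_id and proxy_id not in ALLOWED_PROXIES:
        raise ValueError("proxy_not_found")  # a 400 in the real endpoint
    return proxy_id


assert resolve_new_workflow_proxy(uuid4(), "proxy-us-east") is None
assert resolve_new_workflow_proxy(None, "proxy-us-east") == "proxy-us-east"
```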
```diff
@@ -599,10 +594,20 @@ async def update_crawl_config(
         changed = changed or (
             update.profileid is not None
             and update.profileid != orig_crawl_config.profileid
-            and ((not update.profileid) != (not orig_crawl_config.profileid))
+            and not (update.profileid == "" and not orig_crawl_config.profileid)
+        )
+
+        # either unsetting profile or no profile set on current config
+        no_profile = update.profileid == "" or (
+            update.profileid is None and not orig_crawl_config.profileid
         )

-        changed = changed or (orig_crawl_config.proxyId != update.proxyId)
+        changed = changed or (
+            no_profile
+            and update.proxyId is not None
+            and orig_crawl_config.proxyId != update.proxyId
+            and not (update.proxyId == "" and not orig_crawl_config.proxyId)
+        )

         metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
         metadata_changed = metadata_changed or self.check_attr_changed(
```
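The reworked change detection treats `None` as "field omitted" and `""` as "explicitly clear", and only counts proxy changes when no profile is in effect. A standalone sketch of the two predicates with a few cases (string stand-ins for the real UUID/empty-string types):

```python
from typing import Optional


def profile_changed(new: Optional[str], old: Optional[str]) -> bool:
    # None means the field was omitted from the update entirely
    return (
        new is not None
        and new != old
        and not (new == "" and not old)  # clearing an already-empty profile is a no-op
    )


def proxy_changed(new_profile: Optional[str], old_profile: Optional[str],
                  new_proxy: Optional[str], old_proxy: Optional[str]) -> bool:
    # proxy changes only count when no profile is (or will be) in effect
    no_profile = new_profile == "" or (new_profile is None and not old_profile)
    return (
        no_profile
        and new_proxy is not None
        and new_proxy != old_proxy
        and not (new_proxy == "" and not old_proxy)
    )


assert profile_changed("p2", None)                     # adding a profile
assert not profile_changed(None, "p1")                 # field omitted: unchanged
assert not profile_changed("", None)                   # clearing nothing: unchanged
assert proxy_changed("", "p1", "proxy-a", None)        # unset profile, set proxy
assert not proxy_changed("p2", None, "proxy-a", None)  # profile set: proxy ignored
```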
```diff
@@ -633,8 +638,6 @@ async def update_crawl_config(
             last_rev = ConfigRevision(**orig_dict)
             last_rev = await self.config_revs.insert_one(last_rev.to_dict())

-        proxy_id = update.proxyId
-
         # set update query
         query = update.dict(exclude_unset=True)
         query["modifiedBy"] = user.id
```
```diff
@@ -647,14 +650,15 @@ async def update_crawl_config(
         # else, ensure its a valid profile
         elif update.profileid:
             profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
+            self.assert_can_org_use_proxy(org, profile.proxyId)
             query["profileid"] = update.profileid
-            proxy_id = profile.proxyId
-        # don't change the proxy if profile is set, as it should match the profile proxy
-        elif orig_crawl_config.profileid:
-            proxy_id = None

-        if proxy_id is not None:
-            query["proxyId"] = proxy_id
+        if no_profile:
+            if update.proxyId == "":
+                query["proxyId"] = None
+            elif update.proxyId:
+                self.assert_can_org_use_proxy(org, update.proxyId)
+                query["proxyId"] = update.proxyId

         if update.config is not None:
             query["config"] = update.config.dict()
```
```diff
@@ -1025,9 +1029,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
         await self._add_running_curr_crawl_stats(crawlconfig)

         if crawlconfig.profileid:
-            crawlconfig.profileName = await self.profiles.get_profile_name(
-                crawlconfig.profileid, org
-            )
+            profile = await self.profiles.get_profile(crawlconfig.profileid, org)
+            if profile:
+                crawlconfig.profileName = profile.name
+                crawlconfig.proxyId = profile.proxyId

         crawlconfig.config.seeds = None

```
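Because stored workflows no longer carry their own `proxyId` when a profile is set, the response model back-fills `proxyId` from the profile at read time, so API consumers still see the effective proxy. A hedged read-back sketch (URL shape taken from the tests below; host, token, and IDs are placeholders):

```python
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder deployment
org_id, cid = "ORG-ID", "WORKFLOW-ID"       # placeholders

r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{cid}/",
    headers={"Authorization": "Bearer TOKEN"},
)
data = r.json()
# When profileid is set, proxyId in the response mirrors the profile's proxy,
# even though the stored workflow document has proxyId cleared.
print(data.get("profileid"), data.get("proxyId"))
```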

```diff
@@ -1241,8 +1246,7 @@ async def run_now_internal(
         else:
             profile_filename = ""

-        if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
-            raise HTTPException(status_code=404, detail="proxy_not_found")
+        self.assert_can_org_use_proxy(org, crawlconfig.proxyId)

         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
```
```diff
@@ -1418,6 +1422,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool:
             _proxy.shared and org.allowSharedProxies
         ) or _proxy.id in org.allowedProxies

+    def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]):
+        """assert that proxy can be used or throw error"""
+        if proxy and not self.can_org_use_proxy(org, proxy):
+            raise HTTPException(status_code=400, detail="proxy_not_found")
+
     def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
         """Generate WARC prefix slug from org slug, name or url
         if no name is provided, hostname is used from url, otherwise
```
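The new helper folds the repeated proxy checks into one place and standardizes the failure as a 400 `proxy_not_found` (previously a 404 at some call sites). A stand-in sketch of its contract (not the real class; the allow-list is illustrative):

```python
from typing import Optional


class HTTPException(Exception):  # stand-in for fastapi.HTTPException
    def __init__(self, status_code: int, detail: str):
        self.status_code, self.detail = status_code, detail


def can_org_use_proxy(proxy_id: str) -> bool:
    return proxy_id in {"proxy-us-east"}  # illustrative allow-list


def assert_can_org_use_proxy(proxy_id: Optional[str]) -> None:
    """Mirror of the new helper: silent for falsy or valid proxies, 400 otherwise."""
    if proxy_id and not can_org_use_proxy(proxy_id):
        raise HTTPException(status_code=400, detail="proxy_not_found")


assert_can_org_use_proxy(None)             # no proxy requested: fine
assert_can_org_use_proxy("proxy-us-east")  # allowed: fine
try:
    assert_can_org_use_proxy("bogus")
except HTTPException as exc:
    assert exc.status_code == 400 and exc.detail == "proxy_not_found"
```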

backend/btrixcloud/db.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,7 +35,7 @@
 ) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object


-CURR_DB_VERSION = "0052"
+CURR_DB_VERSION = "0054"


 # ============================================================================
```
Lines changed: 36 additions & 0 deletions
```diff
@@ -0,0 +1,36 @@
+"""
+Migration 0054 -- clear proxyId on workflows that have profile set
+using proxyId from profile always
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0054"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Unset proxyId on workflows that have a profileid set
+        """
+        crawl_configs = self.mdb["crawl_configs"]
+
+        try:
+            await crawl_configs.update_many(
+                {"profileid": {"$ne": None}, "proxyId": {"$ne": None}},
+                {"$set": {"proxyId": None}},
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error update crawl_configs: {err}",
+                flush=True,
+            )
```
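The filter only matches documents where both fields are set, so the migration is idempotent: re-running it matches nothing. A hedged spot-check after running it (assumes `pymongo` and direct database access; the database name `btrix` is a placeholder, the collection name is from the diff):

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder connection string
crawl_configs = client["btrix"]["crawl_configs"]   # db name assumed

# After migration 0054, no workflow should carry both a profile and a proxy.
leftover = crawl_configs.count_documents(
    {"profileid": {"$ne": None}, "proxyId": {"$ne": None}}
)
assert leftover == 0, f"{leftover} workflows still have both profileid and proxyId"
```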

backend/btrixcloud/models.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -2428,6 +2428,8 @@ class ProfileFile(BaseFile):
 class Profile(BaseMongoModel):
     """Browser profile"""

+    id: UUID
+
     name: str
     description: Optional[str] = ""
```
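Pinning `id: UUID` on `Profile` means deserialized profiles are guaranteed a typed ID (relied on above when copying `profile.proxyId` back onto the config). A simplified pydantic sketch of the effect, not the real model:

```python
from uuid import UUID, uuid4
from pydantic import BaseModel, ValidationError


class Profile(BaseModel):  # simplified stand-in; the real class extends BaseMongoModel
    id: UUID
    name: str


p = Profile(id=uuid4(), name="logged-in")
assert isinstance(p.id, UUID)

try:
    Profile(id="not-a-uuid", name="bad")  # non-UUID ids are now rejected
except ValidationError:
    pass
```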

backend/btrixcloud/orgs.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -1300,6 +1300,10 @@ async def import_org(
             if not workflow.get("crawlerChannel"):
                 workflow["crawlerChannel"] = "default"

+            # Ensure proxyId is unset if profile is set
+            if workflow.get("profileid"):
+                workflow["proxyId"] = None
+
             crawl_config = CrawlConfig.from_dict(workflow)
             await self.crawl_configs_db.insert_one(crawl_config.to_dict())
```

backend/test/test_crawlconfigs.py

Lines changed: 53 additions & 0 deletions
```diff
@@ -1074,6 +1074,59 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert page["url"]


+def test_update_profile(crawler_auth_headers, default_org_id, profile_id, profile_2_id):
+
+    def get_profile():
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+            headers=crawler_auth_headers,
+        )
+        assert r.status_code == 200
+        return r.json()["profileid"]
+
+    def update_profile(profileid):
+        r = requests.patch(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+            headers=crawler_auth_headers,
+            json={"profileid": profileid},
+        )
+        assert r.status_code == 200
+        return r.json()
+
+    # No profile to start
+    assert get_profile() == None
+
+    # Add profile
+    data = update_profile(profile_2_id)
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+    assert get_profile() == profile_2_id
+
+    # Same profile
+    data = update_profile(profile_2_id)
+    assert data["settings_changed"] == False
+    assert data["metadata_changed"] == False
+    assert get_profile() == profile_2_id
+
+    # Add different profile
+    data = update_profile(profile_id)
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+    assert get_profile() == profile_id
+
+    # Remove profile
+    data = update_profile("")
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+    assert get_profile() == None
+
+    # No change
+    data = update_profile("")
+    assert data["settings_changed"] == False
+    assert data["metadata_changed"] == False
+    assert get_profile() == None
+
+
 def test_add_crawl_config_fail_on_content_check_no_profile(
     crawler_auth_headers, default_org_id, sample_crawl_data
 ):
```

chart/test/test.yaml

Lines changed: 6 additions & 0 deletions
```diff
@@ -16,6 +16,12 @@ operator_resync_seconds: 3

 qa_scale: 2

+# lower storage sizes
+redis_storage: "100Mi"
+profile_browser_workdir_size: "100Mi"
+crawler_storage: "500Mi"
+
+
 # for testing only
 crawler_extra_cpu_per_browser: 300m
```