Skip to content

Commit b00f59d

Browse files
committed
profile constraint follow-up:
- follow-up to #2988, alternative to #3008
- clear proxyId in workflow if using a profile; ensure it is always empty for new workflows and for updated workflows that have a profile
- add migration to clear proxyId if profileid is set for existing workflows
- avoids having to update proxyId in workflows when it changes in the profile
- fix assertion when updating proxy to return 400 if proxy is invalid
1 parent 0119ae9 commit b00f59d

File tree

4 files changed

+71
-24
lines changed

4 files changed

+71
-24
lines changed

backend/btrixcloud/basecrawls.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,9 +464,11 @@ async def _resolve_crawl_refs(
464464
raise HTTPException(status_code=400, detail="missing_org")
465465

466466
if hasattr(crawl, "profileid") and crawl.profileid:
467-
crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
467+
profile = await self.crawl_configs.profiles.get_profile(
468468
crawl.profileid, org
469469
)
470+
if profile:
471+
crawl.profileName = profile.name
470472

471473
if (
472474
files

backend/btrixcloud/crawlconfigs.py

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -265,13 +265,10 @@ async def add_crawl_config(
265265
proxy_id = config_in.proxyId
266266

267267
profileid = None
268+
# ensure profile is valid, get proxy_id from profile
268269
if isinstance(config_in.profileid, UUID):
269270
profileid = config_in.profileid
270-
271-
# ensure profile is valid, get proxy_id from profile
272-
if profileid:
273-
profile = await self.profiles.get_profile(profileid, org)
274-
proxy_id = profile.proxyId
271+
proxy_id = None
275272
else:
276273
if config_in.config and config_in.config.failOnContentCheck:
277274
raise HTTPException(
@@ -280,8 +277,7 @@ async def add_crawl_config(
280277

281278
# ensure proxy_id is valid and available for org
282279
if proxy_id:
283-
if not self.can_org_use_proxy(org, proxy_id):
284-
raise HTTPException(status_code=404, detail="proxy_not_found")
280+
self.assert_can_org_use_proxy(org, proxy_id)
285281

286282
if config_in.config.exclude:
287283
exclude = config_in.config.exclude
@@ -602,7 +598,15 @@ async def update_crawl_config(
602598
and ((not update.profileid) != (not orig_crawl_config.profileid))
603599
)
604600

605-
changed = changed or (orig_crawl_config.proxyId != update.proxyId)
601+
# either unsetting profile or no profile set on current config
602+
no_profile = update.profileid == "" or not orig_crawl_config.profileid
603+
604+
changed = changed or (
605+
no_profile
606+
and update.proxyId is not None
607+
and orig_crawl_config.proxyId != update.proxyId
608+
and ((not update.proxyId) != (not orig_crawl_config.proxyId))
609+
)
606610

607611
metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
608612
metadata_changed = metadata_changed or self.check_attr_changed(
@@ -633,8 +637,6 @@ async def update_crawl_config(
633637
last_rev = ConfigRevision(**orig_dict)
634638
last_rev = await self.config_revs.insert_one(last_rev.to_dict())
635639

636-
proxy_id = update.proxyId
637-
638640
# set update query
639641
query = update.dict(exclude_unset=True)
640642
query["modifiedBy"] = user.id
@@ -646,15 +648,15 @@ async def update_crawl_config(
646648
query["profileid"] = None
647649
# else, ensure its a valid profile
648650
elif update.profileid:
649-
profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
651+
await self.profiles.get_profile(cast(UUID, update.profileid), org)
650652
query["profileid"] = update.profileid
651-
proxy_id = profile.proxyId
652-
# don't change the proxy if profile is set, as it should match the profile proxy
653-
elif orig_crawl_config.profileid:
654-
proxy_id = None
655653

656-
if proxy_id is not None:
657-
query["proxyId"] = proxy_id
654+
if no_profile:
655+
if update.proxyId == "":
656+
query["proxyId"] = None
657+
elif update.proxyId:
658+
self.assert_can_org_use_proxy(org, update.proxyId)
659+
query["proxyId"] = update.proxyId
658660

659661
if update.config is not None:
660662
query["config"] = update.config.dict()
@@ -1025,9 +1027,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
10251027
await self._add_running_curr_crawl_stats(crawlconfig)
10261028

10271029
if crawlconfig.profileid:
1028-
crawlconfig.profileName = await self.profiles.get_profile_name(
1029-
crawlconfig.profileid, org
1030-
)
1030+
profile = await self.profiles.get_profile(crawlconfig.profileid, org)
1031+
if profile:
1032+
crawlconfig.profileName = profile.name
1033+
crawlconfig.proxyId = profile.proxyId
10311034

10321035
crawlconfig.config.seeds = None
10331036

@@ -1241,8 +1244,8 @@ async def run_now_internal(
12411244
else:
12421245
profile_filename = ""
12431246

1244-
if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
1245-
raise HTTPException(status_code=404, detail="proxy_not_found")
1247+
if crawlconfig.proxyId:
1248+
self.assert_can_org_use_proxy(org, crawlconfig.proxyId)
12461249

12471250
storage_filename = (
12481251
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1418,6 +1421,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo
14181421
_proxy.shared and org.allowSharedProxies
14191422
) or _proxy.id in org.allowedProxies
14201423

1424+
def assert_can_org_use_proxy(self, org: Organization, proxy: str) -> None:
    """Assert that the org may use the given proxy, or raise an HTTP error.

    Thin wrapper around ``can_org_use_proxy`` so that every call site
    rejects an unusable proxy with the same status code and detail.

    Raises:
        HTTPException: 400 ``proxy_not_found`` when ``can_org_use_proxy``
            returns a falsy value for this org/proxy pair.
    """
    # Raise only when the proxy is NOT usable; the check was previously
    # inverted, rejecting valid proxies and accepting invalid ones.
    if not self.can_org_use_proxy(org, proxy):
        raise HTTPException(status_code=400, detail="proxy_not_found")
1428+
14211429
def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
14221430
"""Generate WARC prefix slug from org slug, name or url
14231431
if no name is provided, hostname is used from url, otherwise

backend/btrixcloud/db.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object
3636

3737

38-
CURR_DB_VERSION = "0052"
38+
CURR_DB_VERSION = "0054"
3939

4040

4141
# ============================================================================
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
"""
2+
Migration 0054 -- clear proxyId on workflows that have profile set
3+
using proxyId from profile always
4+
"""
5+
6+
from btrixcloud.migrations import BaseMigration
7+
8+
9+
MIGRATION_VERSION = "0054"
10+
11+
12+
class Migration(BaseMigration):
    """Migration 0054: clear proxyId on workflows that have a profile set."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Sets proxyId to None on every crawl_configs document that has both
        a profileid and a proxyId, since the proxy is always taken from the
        profile. A failure is printed rather than re-raised.
        """
        workflows = self.mdb["crawl_configs"]

        # Only touch workflows that have a profile AND still carry a proxyId
        has_profile_and_proxy = {
            "profileid": {"$ne": None},
            "proxyId": {"$ne": None},
        }
        clear_proxy = {"$set": {"proxyId": None}}

        try:
            await workflows.update_many(has_profile_and_proxy, clear_proxy)
        # pylint: disable=broad-exception-caught
        except Exception as err:
            print(f"Error update crawl_configs: {err}", flush=True)

0 commit comments

Comments
 (0)