From b00f59d6a94b712d581a6c78062170dedd2c6e93 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 20 Nov 2025 17:55:14 -0800 Subject: [PATCH 01/20] profile constraint follow-up: - follow-up to #2988, alternative to #3008 - clear proxyId in workflow if using profile, ensure it is always empty for new workflows/updated workflows that have a profile - add migration to clear proxyId if profileid is set for existing workflows - avoids having to update proxyId in workflows when it changes in the profile - fix assertion when updating proxy to return 400 if proxy is invalid --- backend/btrixcloud/basecrawls.py | 4 +- backend/btrixcloud/crawlconfigs.py | 52 +++++++++++-------- backend/btrixcloud/db.py | 2 +- ..._0054_clear_proxyid_when_using_profiles.py | 37 +++++++++++++ 4 files changed, 71 insertions(+), 24 deletions(-) create mode 100644 backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 2cf09a0f85..75f47a9d74 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -464,9 +464,11 @@ async def _resolve_crawl_refs( raise HTTPException(status_code=400, detail="missing_org") if hasattr(crawl, "profileid") and crawl.profileid: - crawl.profileName = await self.crawl_configs.profiles.get_profile_name( + profile = await self.crawl_configs.profiles.get_profile( crawl.profileid, org ) + if profile: + crawl.profileName = profile.name if ( files diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 552493f170..594ac381b5 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -265,13 +265,10 @@ async def add_crawl_config( proxy_id = config_in.proxyId profileid = None + # ensure profile is valid, get proxy_id from profile if isinstance(config_in.profileid, UUID): profileid = config_in.profileid - - # ensure profile is valid, get proxy_id from profile - if profileid: - profile = await self.profiles.get_profile(profileid, org) - proxy_id = profile.proxyId + proxy_id = None else: if config_in.config and config_in.config.failOnContentCheck: raise HTTPException( @@ -280,8 +277,7 @@ async def add_crawl_config( # ensure proxy_id is valid and available for org if proxy_id: - if not self.can_org_use_proxy(org, proxy_id): - raise HTTPException(status_code=404, detail="proxy_not_found") + self.assert_can_org_use_proxy(org, proxy_id) if config_in.config.exclude: exclude = config_in.config.exclude @@ -602,7 +598,15 @@ async def update_crawl_config( and ((not update.profileid) != (not orig_crawl_config.profileid)) ) - changed = changed or (orig_crawl_config.proxyId != update.proxyId) + # either unsetting profile or no profile set on current config + no_profile = update.profileid == "" or not orig_crawl_config.profileid + + changed = changed or ( + no_profile + and update.proxyId is not None + and orig_crawl_config.proxyId != update.proxyId + and ((not update.proxyId) != (not orig_crawl_config.proxyId)) + ) metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name") metadata_changed = metadata_changed or self.check_attr_changed( @@ -633,8 +637,6 @@ async def update_crawl_config( last_rev = ConfigRevision(**orig_dict) last_rev = await self.config_revs.insert_one(last_rev.to_dict()) - proxy_id = update.proxyId - # set update query query = update.dict(exclude_unset=True) query["modifiedBy"] = user.id @@ -646,15 +648,15 @@ async def update_crawl_config( query["profileid"] = None # else, ensure its a valid profile elif update.profileid: - profile = await self.profiles.get_profile(cast(UUID, update.profileid), org) + await self.profiles.get_profile(cast(UUID, update.profileid), org) query["profileid"] = update.profileid - proxy_id = profile.proxyId - # don't change the proxy if profile is set, as it should match the profile proxy - elif orig_crawl_config.profileid: - proxy_id = None - if proxy_id is not None: - query["proxyId"] = proxy_id + if no_profile: + if update.proxyId == "": + query["proxyId"] = None + elif update.proxyId: + self.assert_can_org_use_proxy(org, update.proxyId) + query["proxyId"] = update.proxyId if update.config is not None: query["config"] = update.config.dict() @@ -1025,9 +1027,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization): await self._add_running_curr_crawl_stats(crawlconfig) if crawlconfig.profileid: - crawlconfig.profileName = await self.profiles.get_profile_name( - crawlconfig.profileid, org - ) + profile = await self.profiles.get_profile(crawlconfig.profileid, org) + if profile: + crawlconfig.profileName = profile.name + crawlconfig.proxyId = profile.proxyId crawlconfig.config.seeds = None @@ -1241,8 +1244,8 @@ async def run_now_internal( else: profile_filename = "" - if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId): - raise HTTPException(status_code=404, detail="proxy_not_found") + if crawlconfig.proxyId: + self.assert_can_org_use_proxy(org, crawlconfig.proxyId) storage_filename = ( crawlconfig.crawlFilenameTemplate or self.default_filename_template @@ -1418,6 +1421,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo _proxy.shared and org.allowSharedProxies ) or _proxy.id in org.allowedProxies + def assert_can_org_use_proxy(self, org: Organization, proxy: str): + """assert that proxy can be used or throw error""" + if self.can_org_use_proxy(org, proxy): + raise HTTPException(status_code=400, detail="proxy_not_found") + def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: """Generate WARC prefix slug from org slug, name or url if no name is provided, hostname is used from url, otherwise diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index c9b403e696..278014deff 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -35,7 +35,7 @@ ) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object -CURR_DB_VERSION = "0052" +CURR_DB_VERSION = "0054" # ============================================================================ diff --git a/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py new file mode 100644 index 0000000000..25e8a82c7c --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py @@ -0,0 +1,37 @@ +""" +Migration 0054 -- clear proxyId on workflows that have profile set +using proxyId from profile always +""" + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0054" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + async def migrate_up(self): + """Perform migration up. + + Unset proxyId on workflows that have a profileid set + """ + crawl_configs = self.mdb["crawl_configs"] + + # Set non-public collections to private + try: + await crawl_configs.update_many( + {"profileid": {"$ne": None}, "proxyId": {"$ne": None}}, + {"$set": {"proxyId": None}}, + ) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Error update crawl_configs: {err}", + flush=True, + ) From a4a2445c7f97c492998f753bfff3f076b8096f42 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 21 Nov 2025 15:24:21 -0800 Subject: [PATCH 02/20] fix typo! --- backend/btrixcloud/crawlconfigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 594ac381b5..7991c46b64 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1423,7 +1423,7 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo def assert_can_org_use_proxy(self, org: Organization, proxy: str): """assert that proxy can be used or throw error""" - if self.can_org_use_proxy(org, proxy): + if not self.can_org_use_proxy(org, proxy): raise HTTPException(status_code=400, detail="proxy_not_found") def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: From ad66915d991eb2a3dbf812f1749b83b7f486ad89 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 21 Nov 2025 15:26:45 -0800 Subject: [PATCH 03/20] none check --- backend/btrixcloud/crawlconfigs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 7991c46b64..d014d7b315 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -1421,9 +1421,9 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> boo _proxy.shared and org.allowSharedProxies ) or _proxy.id in org.allowedProxies - def assert_can_org_use_proxy(self, org: Organization, proxy: str): + def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]): """assert that proxy can be used or throw error""" - if not self.can_org_use_proxy(org, proxy): + if proxy and not self.can_org_use_proxy(org, proxy): raise HTTPException(status_code=400, detail="proxy_not_found") def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str: From 8ce01dc96830e8eefddb58a1b9ff379fa3c8b107 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 22 Nov 2025 17:09:08 -0800 Subject: [PATCH 04/20] org import: attempt to debug test failure also clear proxyId if profileid is set --- backend/btrixcloud/orgs.py | 4 ++++ backend/test/test_y_org_import_export.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 8acb083b73..ea7d5d4f1e 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -1300,6 +1300,10 @@ async def import_org( if not workflow.get("crawlerChannel"): workflow["crawlerChannel"] = "default" + # Ensure proxyId is unset if profile is set + if workflow.get("profileid"): + workflow["proxyId"] = None + crawl_config = CrawlConfig.from_dict(workflow) await self.crawl_configs_db.insert_one(crawl_config.to_dict()) diff --git a/backend/test/test_y_org_import_export.py b/backend/test/test_y_org_import_export.py index a259dd9383..43b12db17d 100644 --- a/backend/test/test_y_org_import_export.py +++ b/backend/test/test_y_org_import_export.py @@ -172,7 +172,8 @@ def test_import_org(admin_auth_headers): f"{API_PREFIX}/orgs/{ORG_FIXTURE_UUID}/all-crawls", headers=admin_auth_headers, ) - assert r.status_code == 200 + print(r.text()) + #assert r.status_code == 200 data = r.json() assert data["total"] == 4 From c667eded815e7df9fcf16adaf8525987b0dfee93 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 22 Nov 2025 17:55:35 -0800 Subject: [PATCH 05/20] fix --- backend/test/test_y_org_import_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/test/test_y_org_import_export.py b/backend/test/test_y_org_import_export.py index 43b12db17d..104debb60b 100644 --- a/backend/test/test_y_org_import_export.py +++ b/backend/test/test_y_org_import_export.py @@ -172,7 +172,7 @@ def test_import_org(admin_auth_headers): f"{API_PREFIX}/orgs/{ORG_FIXTURE_UUID}/all-crawls", headers=admin_auth_headers, ) - print(r.text()) + print(r.text) #assert r.status_code == 200 data = r.json() assert data["total"] == 4 From b40294c28624bb9eeb8f85a72c9ecac697c663b2 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 22 Nov 2025 18:56:20 -0800 Subject: [PATCH 06/20] add 'id: UUID' to Profile to fix import --- backend/btrixcloud/models.py | 2 ++ backend/test/test_y_org_import_export.py | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e729730c2b..05c8502daf 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -2428,6 +2428,8 @@ class ProfileFile(BaseFile): class Profile(BaseMongoModel): """Browser profile""" + id: UUID + name: str description: Optional[str] = "" diff --git a/backend/test/test_y_org_import_export.py b/backend/test/test_y_org_import_export.py index 104debb60b..a259dd9383 100644 --- a/backend/test/test_y_org_import_export.py +++ b/backend/test/test_y_org_import_export.py @@ -172,8 +172,7 @@ def test_import_org(admin_auth_headers): f"{API_PREFIX}/orgs/{ORG_FIXTURE_UUID}/all-crawls", headers=admin_auth_headers, ) - print(r.text) - #assert r.status_code == 200 + assert r.status_code == 200 data = r.json() assert data["total"] == 4 From 24f0c5f70ae81cfcdd1baadfb7ce6fafd537e8f6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 23 Nov 2025 00:52:27 -0800 Subject: [PATCH 07/20] tests: attempt to set lower storage sizes for crawler, redis and profile browser dir --- chart/test/test.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/chart/test/test.yaml b/chart/test/test.yaml index b3f62e86d7..aac4cb2117 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -16,6 +16,12 @@ operator_resync_seconds: 3 qa_scale: 2 +# lower storage sizes +redis_storage: "100Mi" +profile_browser_workdir_size: "100Mi" +crawler_storage: "1Gi" + + # for testing only crawler_extra_cpu_per_browser: 300m From d09bcb55e0d641fe8398622783f6fe196034d5e4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 10:28:01 -0800 Subject: [PATCH 08/20] fix no_profile condition --- backend/btrixcloud/crawlconfigs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index d014d7b315..cb2c000033 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -599,7 +599,9 @@ async def update_crawl_config( ) # either unsetting profile or no profile set on current config - no_profile = update.profileid == "" or not orig_crawl_config.profileid + no_profile = update.profileid == "" or ( + update.profileid is None and not orig_crawl_config.profileid + ) changed = changed or ( no_profile From b8e8777db588c3077ef8c433af1c8cf52e189ecb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 10:37:55 -0800 Subject: [PATCH 09/20] check profile.proxyId --- backend/btrixcloud/crawlconfigs.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index cb2c000033..7dccf4fe05 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -276,8 +276,7 @@ async def add_crawl_config( ) # ensure proxy_id is valid and available for org - if proxy_id: - self.assert_can_org_use_proxy(org, proxy_id) + self.assert_can_org_use_proxy(org, proxy_id) if config_in.config.exclude: exclude = config_in.config.exclude @@ -650,7 +649,8 @@ async def update_crawl_config( query["profileid"] = None # else, ensure its a valid profile elif update.profileid: - await self.profiles.get_profile(cast(UUID, update.profileid), org) + profile = await self.profiles.get_profile(cast(UUID, update.profileid), org) + self.assert_can_org_use_proxy(org, profile.proxyId) query["profileid"] = update.profileid if no_profile: @@ -1246,8 +1246,7 @@ async def run_now_internal( else: profile_filename = "" - if crawlconfig.proxyId: - self.assert_can_org_use_proxy(org, crawlconfig.proxyId) + self.assert_can_org_use_proxy(org, crawlconfig.proxyId) storage_filename = ( crawlconfig.crawlFilenameTemplate or self.default_filename_template From ba7388f82cdeb69c14d0e20e606348ae49ed6b25 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 10:39:10 -0800 Subject: [PATCH 10/20] Apply suggestion from @ikreymer --- .../migration_0054_clear_proxyid_when_using_profiles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py index 25e8a82c7c..48c2f3e012 100644 --- a/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py +++ b/backend/btrixcloud/migrations/migration_0054_clear_proxyid_when_using_profiles.py @@ -23,7 +23,6 @@ async def migrate_up(self): """ crawl_configs = self.mdb["crawl_configs"] - # Set non-public collections to private try: await crawl_configs.update_many( {"profileid": {"$ne": None}, "proxyId": {"$ne": None}}, From 0c568a4757d81136c2dd0892944f377577ba6178 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 12:13:13 -0800 Subject: [PATCH 11/20] ci: add cleanup action --- .github/workflows/k3d-ci.yaml | 5 +++++ chart/test/test.yaml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/k3d-ci.yaml b/.github/workflows/k3d-ci.yaml index 30466d86a3..ba366f2fbf 100644 --- a/.github/workflows/k3d-ci.yaml +++ b/.github/workflows/k3d-ci.yaml @@ -20,6 +20,11 @@ jobs: outputs: matches: ${{ steps.filter.outputs.matches }} steps: + - uses: mathio/gha-cleanup@v1 + with: + remove-browsers: true + verbose: true + - uses: actions/checkout@v4 with: fetch-depth: 2 # important, to fetch previous commit diff --git a/chart/test/test.yaml b/chart/test/test.yaml index aac4cb2117..c01b7c7b26 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -19,7 +19,7 @@ qa_scale: 2 # lower storage sizes redis_storage: "100Mi" profile_browser_workdir_size: "100Mi" -crawler_storage: "1Gi" +crawler_storage: "500Mi" # for testing only From 4576d6d88faa37b142a05dc0a37b13018a38bcb6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 12:16:52 -0800 Subject: [PATCH 12/20] fix --- .github/workflows/k3d-ci.yaml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/k3d-ci.yaml b/.github/workflows/k3d-ci.yaml index ba366f2fbf..6e5ac7185e 100644 --- a/.github/workflows/k3d-ci.yaml +++ b/.github/workflows/k3d-ci.yaml @@ -20,11 +20,6 @@ jobs: outputs: matches: ${{ steps.filter.outputs.matches }} steps: - - uses: mathio/gha-cleanup@v1 - with: - remove-browsers: true - verbose: true - - uses: actions/checkout@v4 with: fetch-depth: 2 # important, to fetch previous commit @@ -50,6 +45,13 @@ jobs: needs: paths-filter if: needs.paths-filter.outputs.matches == 'true' steps: + - name: Initial Disk Cleanup + uses: mathio/gha-cleanup@v1 + with: + remove-browsers: true + verbose: true + + - name: Create k3d Cluster uses: AbsaOSS/k3d-action@v2 with: From f77ecca5642737a1b01f4968880f29947d81c315 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 13:46:13 -0800 Subject: [PATCH 13/20] tests: test simple profile update --- backend/test/test_crawlconfigs.py | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 858cdc2c9a..96748132b6 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -1074,6 +1074,47 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id): assert page["url"] +def test_update_profile( + crawler_auth_headers, default_org_id, profile_2_config_id, profile_id, profile_2_id +): + + # Add profile + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", + headers=crawler_auth_headers, + json={"profileid": profile_2_id}, + ) + assert r.status_code == 200 + data = r.json() + assert data["settings_changed"] == True + assert data["metadata_changed"] == False + assert data["profileid"] == profile_2_id + + # Add different profile + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", + headers=crawler_auth_headers, + json={"profileid": profile_id}, + ) + assert r.status_code == 200 + data = r.json() + assert data["settings_changed"] == True + assert data["metadata_changed"] == False + assert data["profileid"] == profile_id + + # Remove profile + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", + headers=crawler_auth_headers, + json={"profileid": ""}, + ) + assert r.status_code == 200 + data = r.json() + assert data["settings_changed"] == True + assert data["metadata_changed"] == False + assert data["profileid"] == "" + + def test_add_crawl_config_fail_on_content_check_no_profile( crawler_auth_headers, default_org_id, sample_crawl_data ): From eb836534b9fadf42567ae91aa6bec9e7d7efb701 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 13:48:51 -0800 Subject: [PATCH 14/20] tests: retry if non-json response (eg. error code) --- backend/test/test_uploads.py | 38 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index cc23dbb11c..2766272618 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1241,26 +1241,29 @@ def test_delete_form_upload_and_crawls_from_all_crawls( f"{API_PREFIX}/orgs/{default_org_id}/metrics", headers=admin_auth_headers, ) - data = r.json() + try: + data = r.json() - all_good = True + all_good = True - if data["storageUsedBytes"] != org_bytes - total_size: - all_good = False + if data["storageUsedBytes"] != org_bytes - total_size: + all_good = False - if data["storageUsedCrawls"] != org_crawl_bytes - combined_crawl_size: - all_good = False + if data["storageUsedCrawls"] != org_crawl_bytes - combined_crawl_size: + all_good = False - if data["storageUsedUploads"] != org_upload_bytes - upload_size: - all_good = False + if data["storageUsedUploads"] != org_upload_bytes - upload_size: + all_good = False - if all_good: - break + if all_good: + break - if count + 1 == MAX_ATTEMPTS: - assert data["storageUsedBytes"] == org_bytes - total_size - assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size - assert data["storageUsedUploads"] == org_upload_bytes - upload_size + if count + 1 == MAX_ATTEMPTS: + assert data["storageUsedBytes"] == org_bytes - total_size + assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size + assert data["storageUsedUploads"] == org_upload_bytes - upload_size + except: + pass time.sleep(5) count += 1 @@ -1271,8 +1274,11 @@ def test_delete_form_upload_and_crawls_from_all_crawls( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}", headers=admin_auth_headers, ) - if r.json()["totalSize"] == workflow_size - combined_crawl_size: - break + try: + if r.json()["totalSize"] == workflow_size - combined_crawl_size: + break + except: + pass if count + 1 == MAX_ATTEMPTS: assert False From 90447143db72ffa93d8dd2662b71ffe8c1327370 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 14:38:31 -0800 Subject: [PATCH 15/20] fix profile updating to different profile! --- backend/btrixcloud/crawlconfigs.py | 4 ++-- backend/test/test_crawlconfigs.py | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 7dccf4fe05..fea08e08b4 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -594,7 +594,7 @@ async def update_crawl_config( changed = changed or ( update.profileid is not None and update.profileid != orig_crawl_config.profileid - and ((not update.profileid) != (not orig_crawl_config.profileid)) + and not (update.profileid == "" and not orig_crawl_config.profileid) ) # either unsetting profile or no profile set on current config @@ -606,7 +606,7 @@ async def update_crawl_config( no_profile and update.proxyId is not None and orig_crawl_config.proxyId != update.proxyId - and ((not update.proxyId) != (not orig_crawl_config.proxyId)) + and not (update.proxyId == "" and not orig_crawl_config.proxyId) ) metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name") diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 96748132b6..d05e76eb05 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -1086,9 +1086,21 @@ def test_update_profile( ) assert r.status_code == 200 data = r.json() + assert data["profileid"] == profile_2_id assert data["settings_changed"] == True assert data["metadata_changed"] == False + + # Same profile + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", + headers=crawler_auth_headers, + json={"profileid": profile_2_id}, + ) + assert r.status_code == 200 + data = r.json() assert data["profileid"] == profile_2_id + assert data["settings_changed"] == False + assert data["metadata_changed"] == False # Add different profile r = requests.patch( @@ -1098,9 +1110,9 @@ def test_update_profile( ) assert r.status_code == 200 data = r.json() + assert data["profileid"] == profile_id assert data["settings_changed"] == True assert data["metadata_changed"] == False - assert data["profileid"] == profile_id # Remove profile r = requests.patch( @@ -1110,9 +1122,21 @@ def test_update_profile( ) assert r.status_code == 200 data = r.json() + assert data["profileid"] == "" assert data["settings_changed"] == True assert data["metadata_changed"] == False + + # No change + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", + headers=crawler_auth_headers, + json={"profileid": ""}, + ) + assert r.status_code == 200 + data = r.json() assert data["profileid"] == "" + assert data["settings_changed"] == False + assert data["metadata_changed"] == False def test_add_crawl_config_fail_on_content_check_no_profile( From 76816b2c43e277c69e5cf2dfa5ccb4f8e8a751d6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 15:14:30 -0800 Subject: [PATCH 16/20] fix test --- backend/test/test_crawlconfigs.py | 71 ++++++++++++------------------- backend/test/test_uploads.py | 4 +- 2 files changed, 31 insertions(+), 44 deletions(-) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index d05e76eb05..55e558cb01 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -1074,69 +1074,54 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id): assert page["url"] -def test_update_profile( - crawler_auth_headers, default_org_id, profile_2_config_id, profile_id, profile_2_id -): +def test_update_profile(crawler_auth_headers, default_org_id, profile_id, profile_2_id): + + def get_profile(): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + return r.json() + + def update_profile(profileid): + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/", + headers=crawler_auth_headers, + json={"profileid": profileid}, + ) + assert r.status_code == 200 + return r.json() # Add profile - r = requests.patch( - f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", - headers=crawler_auth_headers, - json={"profileid": profile_2_id}, - ) - assert r.status_code == 200 - data = r.json() - assert data["profileid"] == profile_2_id + data = update_profile(profile_2_id) assert data["settings_changed"] == True assert data["metadata_changed"] == False + assert get_profile() == profile_2_id # Same profile - r = requests.patch( - f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", - headers=crawler_auth_headers, - json={"profileid": profile_2_id}, - ) - assert r.status_code == 200 - data = r.json() - assert data["profileid"] == profile_2_id + data = update_profile(profile_2_id) assert data["settings_changed"] == False assert data["metadata_changed"] == False + assert get_profile() == profile_2_id # Add different profile - r = requests.patch( - f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", - headers=crawler_auth_headers, - json={"profileid": profile_id}, - ) - assert r.status_code == 200 - data = r.json() - assert data["profileid"] == profile_id + data = update_profile(profile_id) assert data["settings_changed"] == True assert data["metadata_changed"] == False + assert get_profile() == profile_id # Remove profile - r = requests.patch( - f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", - headers=crawler_auth_headers, - json={"profileid": ""}, - ) - assert r.status_code == 200 - data = r.json() - assert data["profileid"] == "" + data = update_profile("") assert data["settings_changed"] == True assert data["metadata_changed"] == False + assert get_profile() == "" # No change - r = requests.patch( - f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/", - headers=crawler_auth_headers, - json={"profileid": ""}, - ) - assert r.status_code == 200 - data = r.json() - assert data["profileid"] == "" + data = update_profile("") assert data["settings_changed"] == False assert data["metadata_changed"] == False + assert get_profile() == "" def test_add_crawl_config_fail_on_content_check_no_profile( diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 2766272618..0f2c0570e4 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1260,7 +1260,9 @@ def test_delete_form_upload_and_crawls_from_all_crawls( if count + 1 == MAX_ATTEMPTS: assert data["storageUsedBytes"] == org_bytes - total_size - assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size + assert ( + data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size + ) assert data["storageUsedUploads"] == org_upload_bytes - upload_size except: pass From 3270b12797d399ab1dbf83bebb3886768e42ad89 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 15:35:04 -0800 Subject: [PATCH 17/20] fix typo --- backend/test/test_crawlconfigs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 55e558cb01..61ff349cbf 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -1082,7 +1082,7 @@ def get_profile(): headers=crawler_auth_headers, ) assert r.status_code == 200 - return r.json() + return r.json()["profileid"] def update_profile(profileid): r = requests.patch( From 668671e8eee3bcf3ee52d38c2c107d366d60359c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 16:12:09 -0800 Subject: [PATCH 18/20] fix tests, allow retry if deletion returns an invalid status --- backend/test/test_crawlconfigs.py | 7 +++++-- backend/test/test_uploads.py | 21 +++++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 61ff349cbf..036a2734b0 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -1093,6 +1093,9 @@ def update_profile(profileid): assert r.status_code == 200 return r.json() + # No profile to start + assert get_profile() == None + # Add profile data = update_profile(profile_2_id) assert data["settings_changed"] == True @@ -1115,13 +1118,13 @@ def update_profile(profileid): data = update_profile("") assert data["settings_changed"] == True assert data["metadata_changed"] == False - assert get_profile() == "" + assert get_profile() == None # No change data = update_profile("") assert data["settings_changed"] == False assert data["metadata_changed"] == False - assert get_profile() == "" + assert get_profile() == None def test_add_crawl_config_fail_on_content_check_no_profile( diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 0f2c0570e4..15d913057d 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1225,12 +1225,21 @@ def test_delete_form_upload_and_crawls_from_all_crawls( assert r.json()["detail"] == "not_allowed" # Delete mixed type archived items - r = requests.post( - f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete", - headers=admin_auth_headers, - json={"crawl_ids": crawls_to_delete}, - ) - data = r.json() + count = 0 + data = {} + while count < MAX_ATTEMPTS: + try: + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete", + headers=admin_auth_headers, + json={"crawl_ids": crawls_to_delete}, + ) + data = r.json() + break + except: + time.sleep(5) + count += 1 + assert data["deleted"] assert data["storageQuotaReached"] is False From 52daf2f8a98189af21b0625895b72206841cc2d1 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 21:53:50 -0800 Subject: [PATCH 19/20] undo test changes, try to optimize delete --- backend/btrixcloud/basecrawls.py | 11 +++--- backend/test/test_uploads.py | 61 ++++++++++++-------------------- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 75f47a9d74..806ee5ab7c 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -840,6 +840,12 @@ async def delete_crawls_all_types( crawls: list[str] = [] uploads: list[str] = [] + async def recompute(cids_to_update): + for cid, cid_dict in cids_to_update.items(): + cid_size = cid_dict["size"] + cid_inc = cid_dict["inc"] + await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc) + for crawl_id in delete_list.crawl_ids: crawl = await self.get_base_crawl(crawl_id, org) if crawl.type == "crawl": @@ -864,10 +870,7 @@ async def delete_crawls_all_types( ) deleted_count += deleted - for cid, cid_dict in cids_to_update.items(): - cid_size = cid_dict["size"] - cid_inc = cid_dict["inc"] - await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc) + asyncio.create_task(recompute(cids_to_update)) if uploads_length: upload_delete_list = DeleteCrawlList(crawl_ids=uploads) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 15d913057d..cc23dbb11c 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1225,21 +1225,12 @@ def test_delete_form_upload_and_crawls_from_all_crawls( assert r.json()["detail"] == "not_allowed" # Delete mixed type archived items - count = 0 - data = {} - while count < MAX_ATTEMPTS: - try: - r = requests.post( - f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete", - headers=admin_auth_headers, - json={"crawl_ids": crawls_to_delete}, - ) - data = r.json() - break - except: - time.sleep(5) - count += 1 - + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete", + headers=admin_auth_headers, + json={"crawl_ids": crawls_to_delete}, + ) + data = r.json() assert data["deleted"] assert data["storageQuotaReached"] is False @@ -1250,31 +1241,26 @@ def test_delete_form_upload_and_crawls_from_all_crawls( f"{API_PREFIX}/orgs/{default_org_id}/metrics", headers=admin_auth_headers, ) - try: - data = r.json() + data = r.json() - all_good = True + all_good = True - if data["storageUsedBytes"] != org_bytes - total_size: - all_good = False + if data["storageUsedBytes"] != org_bytes - total_size: + all_good = False - if data["storageUsedCrawls"] != org_crawl_bytes - combined_crawl_size: - all_good = False + if data["storageUsedCrawls"] != org_crawl_bytes - combined_crawl_size: + all_good = False - if data["storageUsedUploads"] != org_upload_bytes - upload_size: - all_good = False + if data["storageUsedUploads"] != org_upload_bytes - upload_size: + all_good = False - if all_good: - break + if all_good: + break - if count + 1 == MAX_ATTEMPTS: - assert data["storageUsedBytes"] == org_bytes - total_size - assert ( - data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size - ) - assert data["storageUsedUploads"] == org_upload_bytes - upload_size - except: - pass + if count + 1 == MAX_ATTEMPTS: + assert data["storageUsedBytes"] == org_bytes - total_size + assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size + assert data["storageUsedUploads"] == org_upload_bytes - upload_size time.sleep(5) count += 1 @@ -1285,11 +1271,8 @@ def test_delete_form_upload_and_crawls_from_all_crawls( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{all_crawls_delete_config_id}", headers=admin_auth_headers, ) - try: - if r.json()["totalSize"] == workflow_size - combined_crawl_size: - break - except: - pass + if r.json()["totalSize"] == workflow_size - combined_crawl_size: + break if count + 1 == MAX_ATTEMPTS: assert False From c2f80717328df75ec3d0041b073d5993b8b858cc Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 24 Nov 2025 22:31:03 -0800 Subject: [PATCH 20/20] undo change --- backend/btrixcloud/basecrawls.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 806ee5ab7c..75f47a9d74 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -840,12 +840,6 @@ async def delete_crawls_all_types( crawls: list[str] = [] uploads: list[str] = [] - async def recompute(cids_to_update): - for cid, cid_dict in cids_to_update.items(): - cid_size = cid_dict["size"] - cid_inc = cid_dict["inc"] - await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc) - for crawl_id in delete_list.crawl_ids: crawl = await self.get_base_crawl(crawl_id, org) if crawl.type == "crawl": @@ -870,7 +864,10 @@ async def recompute(cids_to_update): ) deleted_count += deleted - asyncio.create_task(recompute(cids_to_update)) + for cid, cid_dict in cids_to_update.items(): + cid_size = cid_dict["size"] + cid_inc = cid_dict["inc"] + await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc) if uploads_length: upload_delete_list = DeleteCrawlList(crawl_ids=uploads)