From d8e4f9388cb2a63314cac16e3a3d1b815f6340cf Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 5 Nov 2025 16:58:49 -0500 Subject: [PATCH 01/28] Pause crawls instead of stopping when quotas are reached --- backend/btrixcloud/crawls.py | 10 +++--- backend/btrixcloud/models.py | 21 +++++++++--- backend/btrixcloud/operator/crawls.py | 48 ++++++++++++++++++++------- backend/btrixcloud/operator/models.py | 3 ++ 4 files changed, 62 insertions(+), 20 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index f1519cec14..7c2200d83c 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -812,7 +812,11 @@ async def get_crawl_stats( return crawls_data async def pause_crawl( - self, crawl_id: str, org: Organization, pause: bool + self, + crawl_id: str, + org: Organization, + pause: bool, + paused_at: Optional[datetime] = None, ) -> Dict[str, bool]: """pause or resume a crawl temporarily""" crawl = await self.get_base_crawl(crawl_id, org) @@ -821,10 +825,8 @@ async def pause_crawl( result = None - if pause: + if pause and not paused_at: paused_at = dt_now() - else: - paused_at = None try: result = await self.crawl_manager.pause_resume_crawl( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e0347c6f29..8d1683a713 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -237,10 +237,23 @@ class UserOrgInfoOut(BaseModel): ] RUNNING_STATES = get_args(TYPE_RUNNING_STATES) -TYPE_WAITING_STATES = Literal[ - "starting", "waiting_capacity", "waiting_org_limit", "paused" +TYPE_PAUSED_STATES = Literal[ + "paused", + "paused_storage_quota_reached", + "paused_time_quota_reached", + "paused_org_readonly", ] -WAITING_STATES = get_args(TYPE_WAITING_STATES) +PAUSED_STATES = get_args(TYPE_PAUSED_STATES) + +TYPE_WAITING_NOT_PAUSED_STATES = Literal[ + "starting", + "waiting_capacity", + "waiting_org_limit", +] +WAITING_NOT_PAUSED_STATES = get_args(TYPE_WAITING_NOT_PAUSED_STATES) + +TYPE_WAITING_STATES = Literal[TYPE_PAUSED_STATES, TYPE_WAITING_NOT_PAUSED_STATES] +WAITING_STATES = [*PAUSED_STATES, *WAITING_NOT_PAUSED_STATES] TYPE_FAILED_STATES = Literal[ "canceled", @@ -260,7 +273,7 @@ class UserOrgInfoOut(BaseModel): "stopped_org_readonly", ] SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES) -SUCCESSFUL_AND_PAUSED_STATES = ["paused", *SUCCESSFUL_STATES] +SUCCESSFUL_AND_PAUSED_STATES = [*PAUSED_STATES, *SUCCESSFUL_STATES] TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES] RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES] diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 51bf7a02bb..6e1bd20115 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -21,16 +21,19 @@ TYPE_NON_RUNNING_STATES, TYPE_RUNNING_STATES, TYPE_ALL_CRAWL_STATES, + TYPE_PAUSED_STATES, RUNNING_STATES, WAITING_STATES, RUNNING_AND_STARTING_ONLY, RUNNING_AND_WAITING_STATES, SUCCESSFUL_STATES, FAILED_STATES, + PAUSED_STATES, CrawlStats, CrawlFile, CrawlCompleteIn, StorageRef, + Organization, ) from btrixcloud.utils import ( @@ -394,7 +397,7 @@ async def sync_crawls(self, data: MCSyncData): if status.pagesFound < status.desiredScale: status.desiredScale = max(1, status.pagesFound) - is_paused = bool(crawl.paused_at) and status.state == "paused" + is_paused = bool(crawl.paused_at) and status.state in PAUSED_STATES for i in range(0, status.desiredScale): if status.pagesFound < i * num_browsers_per_pod: @@ -1421,7 
+1424,7 @@ async def is_crawl_stopping( if crawl.stopping: return "stopped_by_user" - if crawl.paused_at: + if crawl.paused_at and status.stopReason not in PAUSED_STATES: return "paused" # check timeout if timeout time exceeds elapsed time @@ -1438,26 +1441,34 @@ async def is_crawl_stopping( if crawl.max_crawl_size and status.size > crawl.max_crawl_size: return "size-limit" - # gracefully stop crawl if current running crawl sizes reach storage quota + # pause crawl if current running crawl sizes reach storage quota org = crawl.org if org.readOnly: - return "stopped_org_readonly" + await self.pause_crawl(crawl, org) + return "paused_org_readonly" if org.quotas.storageQuota: active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( crawl.oid ) - if self.org_ops.storage_quota_reached(org, active_crawls_total_size): - return "stopped_storage_quota_reached" + await self.pause_crawl(crawl, org) + return "paused_storage_quota_reached" # gracefully stop crawl is execution time quota is reached if self.org_ops.exec_mins_quota_reached(org): - return "stopped_time_quota_reached" + await self.pause_crawl(crawl, org) + return "paused_time_quota_reached" return None + async def pause_crawl(self, crawl: CrawlSpec, org: Organization): + """Pause crawl and update crawl spec""" + paused_at = dt_now() + await self.crawl_ops.pause_crawl(crawl.id, org, pause=True, paused_at=paused_at) + crawl.paused_at = paused_at + async def get_redis_crawl_stats( self, redis: Redis, crawl_id: str ) -> tuple[CrawlStats, dict[str, Any]]: @@ -1544,7 +1555,7 @@ async def update_crawl_state( ) # check if no longer paused, clear paused stopping state - if status.stopReason == "paused" and not crawl.paused_at: + if status.stopReason in PAUSED_STATES and not crawl.paused_at: status.stopReason = None status.stopping = False # should have already been removed, just in case @@ -1556,9 +1567,9 @@ async def update_crawl_state( # mark crawl as stopping if status.stopping: - if status.stopReason == "paused": + if status.stopReason in PAUSED_STATES: await redis.set(f"{crawl.id}:paused", "1") - print(f"Crawl pausing, id: {crawl.id}") + print(f"Crawl pausing: {status.stopReason}, id: {crawl.id}") else: await redis.set(f"{crawl.id}:stopping", "1") print( @@ -1582,14 +1593,27 @@ async def update_crawl_state( all_completed = (num_done + num_failed) >= status.scale # check paused - if not all_completed and crawl.paused_at and status.stopReason == "paused": + if not all_completed and crawl.paused_at and status.stopReason in PAUSED_STATES: num_paused = status_count.get("interrupted", 0) if (num_paused + num_failed) >= status.scale: # now fully paused! 
# remove pausing key and set state to paused + paused_state: TYPE_PAUSED_STATES + if status.stopReason == "paused_storage_quota_reached": + paused_state = "paused_storage_quota_reached" + if status.stopReason == "paused_time_quota_reached": + paused_state = "paused_time_quota_reached" + if status.stopReason == "paused_org_readoly": + paused_state = "paused_org_readonly" + else: + paused_state = "paused" + await redis.delete(f"{crawl.id}:paused") await self.set_state( - "paused", status, crawl, allowed_from=RUNNING_AND_WAITING_STATES + paused_state, + status, + crawl, + allowed_from=RUNNING_AND_WAITING_STATES, ) return status diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 93677d4bab..5abdbc896d 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -25,6 +25,9 @@ "stopped_storage_quota_reached", "stopped_time_quota_reached", "stopped_org_readonly", + "paused_storage_quota_reached", + "paused_time_quota_reached", + "paused_org_readonly", ] From bbdee7352a423f2d06625ca61141b10e00c98a9f Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 6 Nov 2025 11:38:36 -0500 Subject: [PATCH 02/28] Update nightly tests --- backend/test_nightly/test_execution_minutes_quota.py | 12 ++++++------ backend/test_nightly/test_storage_quota.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/backend/test_nightly/test_execution_minutes_quota.py b/backend/test_nightly/test_execution_minutes_quota.py index cd81d86330..3640438f60 100644 --- a/backend/test_nightly/test_execution_minutes_quota.py +++ b/backend/test_nightly/test_execution_minutes_quota.py @@ -29,7 +29,7 @@ def test_set_execution_mins_quota(org_with_quotas, admin_auth_headers): assert data.get("updated") == True -def test_crawl_stopped_when_quota_reached(org_with_quotas, admin_auth_headers): +def test_crawl_paused_when_quota_reached(org_with_quotas, admin_auth_headers): # Run crawl global config_id crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) @@ -49,10 +49,10 @@ def test_crawl_stopped_when_quota_reached(org_with_quotas, admin_auth_headers): ): time.sleep(2) - # Ensure that crawl was stopped by quota + # Ensure that crawl was paused by quota assert ( get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) - == "stopped_time_quota_reached" + == "paused_time_quota_reached" ) time.sleep(5) @@ -108,7 +108,7 @@ def test_set_execution_mins_extra_quotas(org_with_quotas, admin_auth_headers): @pytest.mark.timeout(1200) -def test_crawl_stopped_when_quota_reached_with_extra( +def test_crawl_paused_when_quota_reached_with_extra( org_with_quotas, admin_auth_headers ): # Run crawl @@ -133,10 +133,10 @@ def test_crawl_stopped_when_quota_reached_with_extra( ): time.sleep(2) - # Ensure that crawl was stopped by quota + # Ensure that crawl was paused by quota assert ( get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) - == "stopped_time_quota_reached" + == "paused_time_quota_reached" ) time.sleep(5) diff --git a/backend/test_nightly/test_storage_quota.py b/backend/test_nightly/test_storage_quota.py index 7945092415..dc8fa9f084 100644 --- a/backend/test_nightly/test_storage_quota.py +++ b/backend/test_nightly/test_storage_quota.py @@ -57,7 +57,7 @@ def test_storage_quota(org_with_quotas, admin_auth_headers): assert r.json()["updated"] -def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_headers): +def test_crawl_paused_when_storage_quota_reached(org_with_quotas, admin_auth_headers): # Run 
crawl global config_id crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) @@ -82,7 +82,7 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he assert ( get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) - == "stopped_storage_quota_reached" + == "paused_storage_quota_reached" ) # Ensure crawl storage went over quota From 3ed00212ff024da9a320449eee1a51f2886f7dd8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 11 Nov 2025 12:12:25 -0500 Subject: [PATCH 03/28] Update frontend for new paused states --- frontend/README.md | 2 + .../features/archived-items/crawl-status.ts | 41 +++++++++++- .../workflow-action-menu.ts | 5 +- frontend/src/pages/org/workflow-detail.ts | 11 ++-- frontend/src/stories/design/action-menus.mdx | 5 +- .../src/stories/design/status-indicators.mdx | 64 +++++++++---------- frontend/src/types/crawlState.ts | 17 ++++- 7 files changed, 102 insertions(+), 43 deletions(-) diff --git a/frontend/README.md b/frontend/README.md index 33f2244e45..29d3742a77 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -5,12 +5,14 @@ This directory contains the following components: ## Docs (`/docs`) Documentation for running, developing, and writing docs: + - Hosted: [docs.browsertrix.com/develop/docs](https://docs.browsertrix.com/develop/docs) - Local: [develop/docs.md](./docs/docs/develop/docs.md) ## User Interface (`/src`) Documentation for developing the user interface: + - Hosted: [docs.browsertrix.com/develop/frontend-dev](https://docs.browsertrix.com/develop/frontend-dev) - Local: [develop/frontend-dev.md](./docs/docs/develop/frontend-dev.md) diff --git a/frontend/src/features/archived-items/crawl-status.ts b/frontend/src/features/archived-items/crawl-status.ts index ed5a23cdaa..0c2bac403f 100644 --- a/frontend/src/features/archived-items/crawl-status.ts +++ b/frontend/src/features/archived-items/crawl-status.ts @@ -5,7 +5,11 @@ import startCase from "lodash/fp/startCase"; import { TailwindElement } from "@/classes/TailwindElement"; import { labelWithIcon } from "@/layouts/labelWithIcon"; -import { RUNNING_STATES, type CrawlState } from "@/types/crawlState"; +import { + PAUSED_STATES, + RUNNING_STATES, + type CrawlState, +} from "@/types/crawlState"; import { animatePulse } from "@/utils/css"; type CrawlType = "crawl" | "qa"; @@ -170,6 +174,36 @@ export class CrawlStatus extends TailwindElement { label = msg("Paused"); break; + case "paused_storage_quota_reached": + color = "var(--sl-color-neutral-500)"; + icon = html``; + label = msg("Paused: Storage Quota Reached"); + break; + + case "paused_time_quota_reached": + color = "var(--sl-color-neutral-500)"; + icon = html``; + label = msg("Paused: Time Quota Reached"); + break; + + case "paused_org_readonly": + color = "var(--sl-color-neutral-500)"; + icon = html``; + label = msg("Paused: Crawling Disabled"); + break; + case "pending-wait": color = "var(--sl-color-violet-600)"; icon = html` diff --git a/frontend/src/stories/design/status-indicators.mdx b/frontend/src/stories/design/status-indicators.mdx index 4bae8e1077..144fb14294 100644 --- a/frontend/src/stories/design/status-indicators.mdx +++ b/frontend/src/stories/design/status-indicators.mdx @@ -1,6 +1,7 @@ -import { Meta, Canvas } from '@storybook/addon-docs/blocks'; -import { AllStates } from '../features/archived-items/CrawlStatus.stories'; +import { Canvas, Meta } from "@storybook/addon-docs/blocks"; + import { ColorSwatch } from "../../../.storybook/blocks/ColorSwatch"; +import { AllStates } from 
"../features/archived-items/CrawlStatus.stories"; @@ -21,39 +22,38 @@ further clarity as to what they indicate. ### Generalized Status -| Icon & Label | Color | Context | Description | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | --------------- | --------------------------------------------------------------- | -| 1 Crawl Running | Shoelace `--sl-color-success-600` | Dashboard | Count of crawls in "running" status when that count is non-zero | -| 0 Crawls Running | Shoelace Shoelace `--sl-color-neutral-600` | Dashboard | Count of crawls in "running" status when that count is zero | -| 1 Crawl Workflow Waiting | Shoelace `--sl-color-violet-600` | Dashboard | Count of crawls in "waiting" status | -| No Crawls Yet | Shoelace `--sl-color-neutral-400` | Crawl Workflows | Used to show that a workflow has no crawls | -| Complete | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was completed | -| Stopped | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped by the user | -| Stopped: [Reason] | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped due to system reason | -| Canceled | Shoelace `--sl-color-neutral-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was canceled | -| Starting | Shoelace `--sl-color-violet-600` | Crawl Workflows | Used to show that a crawl is starting | -| Running | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a crawl is running | -| Behavior timed out | Shoelace `--sl-color-warning-600` | Crawl Logs | Used to show a warning log from a behavior | -| Success | Shoelace `--sl-color-success-600` | Toasts | Used to show a success notification | -| Warning | Shoelace `--sl-color-warning-600` | Toasts | Used to show a warning notification | -| Danger | Shoelace `--sl-color-danger-600` | Toasts | Used to show an error notification | - +| Icon & Label | Color | Context | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------- | +| 1 Crawl Running | Shoelace `--sl-color-success-600` | Dashboard | Count of crawls in "running" status when that count is non-zero | +| 0 Crawls Running | Shoelace Shoelace `--sl-color-neutral-600` | Dashboard | Count of crawls in "running" status when that count is zero | +| 1 Crawl Workflow Waiting | Shoelace `--sl-color-violet-600` | Dashboard | Count of crawls in "waiting" status | +| No Crawls Yet | Shoelace `--sl-color-neutral-400` | Crawl Workflows | Used to show that a workflow has no crawls | +| Complete | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was completed | +| Stopped | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped by the user | +| Stopped: [Reason] | Shoelace `--sl-color-warning-600` | Crawl 
Workflows | Used to show that a workflow's most recent crawl was stopped due to system reason | +| Canceled | Shoelace `--sl-color-neutral-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was canceled | +| Starting | Shoelace `--sl-color-violet-600` | Crawl Workflows | Used to show that a crawl is starting | +| Running | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a crawl is running | +| Behavior timed out | Shoelace `--sl-color-warning-600` | Crawl Logs | Used to show a warning log from a behavior | +| Success | Shoelace `--sl-color-success-600` | Toasts | Used to show a success notification | +| Warning | Shoelace `--sl-color-warning-600` | Toasts | Used to show a warning notification | +| Danger | Shoelace `--sl-color-danger-600` | Toasts | Used to show an error notification | ### Exact Status - + ## Intended Implementation -| Status | Color | Description | Icons | Examples | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Empty | Shoelace `--sl-color-neutral-400` | Used for empty states where no data is present | `slash-circle` | No Crawls Yet | -| Pending | Shoelace `--sl-color-violet-600` | Used when a process is queued or starting but is not yet running. Should be animated when indicating the status of a single object. | `hourglass-split`, or the icon of the next state being transitioned to (pulsing) | 1 Crawl Workflow Waiting
Starting
Resuming | -| Running | Shoelace `--sl-color-success-600` | Used when a process is actively running. Should be animated when indicating the status of a single object. | `dot` | Running | -| Paused | Shoelace `--sl-color-neutral-600` | Used for paused states | `pause-circle` or `play-circle` | Pause
Resume | -| Success | Shoelace `--sl-color-success-600` | Used for positive / successful states | `check-circle-fill` or `check2-circle` | Complete | -| Info | Shoelace `--sl-color-neutral-500` | Used for neutral, informational states | `info-circle-fill` | Behavior Log | -| Incomplete | Shoelace `--sl-color-warning-600` | Used for states that are ambiguous or partially satisfied, but no longer running | `dash-square-fill` | Stopped | -| Warning | Shoelace `--sl-color-warning-600` | Used for warning states, something is wrong but not critically | `exclamation-diamond-fill` or `exclamation-diamond` | Warning | -| Danger | Shoelace `--sl-color-danger-600` | Used for non-fatal errors that may be addressed by the user | `exclamation-triangle-fill` or `exclamation-triangle` | Payment Failed | -| Fatal | Shoelace `--sl-color-danger-600` | Used for fatal errors and actions that result in data loss | `x-octagon-fill` or `x-octagon` | Cancel | +| Status | Color | Description | Icons | Examples | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Empty | Shoelace `--sl-color-neutral-400` | Used for empty states where no data is present | `slash-circle` | No Crawls Yet | +| Pending | Shoelace `--sl-color-violet-600` | Used when a process is queued or starting but is not yet running. Should be animated when indicating the status of a single object. | `hourglass-split`, or the icon of the next state being transitioned to (pulsing) | 1 Crawl Workflow Waiting
Starting
Resuming | +| Running | Shoelace `--sl-color-success-600` | Used when a process is actively running. Should be animated when indicating the status of a single object. | `dot` | Running | +| Paused | Shoelace `--sl-color-neutral-600` | Used for paused states | `pause-circle` or `play-circle` | Pause
Resume | +| Success | Shoelace `--sl-color-success-600` | Used for positive / successful states | `check-circle-fill` or `check2-circle` | Complete | +| Info | Shoelace `--sl-color-neutral-500` | Used for neutral, informational states | `info-circle-fill` | Behavior Log | +| Incomplete | Shoelace `--sl-color-warning-600` | Used for states that are ambiguous or partially satisfied, but no longer running | `dash-square-fill` | Stopped | +| Warning | Shoelace `--sl-color-warning-600` | Used for warning states, something is wrong but not critically | `exclamation-diamond-fill` or `exclamation-diamond` | Warning | +| Danger | Shoelace `--sl-color-danger-600` | Used for non-fatal errors that may be addressed by the user | `exclamation-triangle-fill` or `exclamation-triangle` | Payment Failed | +| Fatal | Shoelace `--sl-color-danger-600` | Used for fatal errors and actions that result in data loss | `x-octagon-fill` or `x-octagon` | Cancel | diff --git a/frontend/src/types/crawlState.ts b/frontend/src/types/crawlState.ts index 6abc45b67b..f32e06c1ae 100644 --- a/frontend/src/types/crawlState.ts +++ b/frontend/src/types/crawlState.ts @@ -6,12 +6,25 @@ export const RUNNING_STATES = [ "uploading-wacz", ] as const; -// Match backend TYPE_WAITING_STATES in models.py -export const WAITING_STATES = [ +// Match backend TYPE_WAITING_NOT_PAUSED_STATES in models.py +export const WAITING_NOT_PAUSED_STATES = [ "starting", "waiting_capacity", "waiting_org_limit", +] as const; + +// Match backend TYPE_PAUSED_STATES in models.py +export const PAUSED_STATES = [ "paused", + "paused_storage_quota_reached", + "paused_time_quota_reached", + "paused_org_readonly", +] as const; + +// Match backend TYPE_WAITING_STATES in models.py +export const WAITING_STATES = [ + ...WAITING_NOT_PAUSED_STATES, + ...PAUSED_STATES, ] as const; // Match backend TYPE_SUCCESSFUL_STATES in models.py From 86144310ea865b8e9ee1c87d16ba4cc2ae2136d1 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 11 Nov 2025 12:59:33 -0500 Subject: [PATCH 04/28] Fix comments --- backend/btrixcloud/operator/crawls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 6e1bd20115..0ca8929421 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1444,10 +1444,12 @@ async def is_crawl_stopping( # pause crawl if current running crawl sizes reach storage quota org = crawl.org + # pause crawl if org is set read-only if org.readOnly: await self.pause_crawl(crawl, org) return "paused_org_readonly" + # pause crawl if storage quota is reached if org.quotas.storageQuota: active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( crawl.oid @@ -1456,7 +1458,7 @@ async def is_crawl_stopping( await self.pause_crawl(crawl, org) return "paused_storage_quota_reached" - # gracefully stop crawl is execution time quota is reached + # pause crawl if execution time quota is reached if self.org_ops.exec_mins_quota_reached(org): await self.pause_crawl(crawl, org) return "paused_time_quota_reached" From 7303c5bd30fbd51b8f15acb016ba8a95cf0147b2 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 11 Nov 2025 17:06:28 -0500 Subject: [PATCH 05/28] Fix status.stopReason handling for paused states --- backend/btrixcloud/operator/crawls.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 0ca8929421..55bcb342e4 100644 
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -1603,9 +1603,9 @@ async def update_crawl_state(
                 paused_state: TYPE_PAUSED_STATES
                 if status.stopReason == "paused_storage_quota_reached":
                     paused_state = "paused_storage_quota_reached"
-                if status.stopReason == "paused_time_quota_reached":
+                elif status.stopReason == "paused_time_quota_reached":
                     paused_state = "paused_time_quota_reached"
-                if status.stopReason == "paused_org_readoly":
+                elif status.stopReason == "paused_org_readonly":
                     paused_state = "paused_org_readonly"
                 else:
                     paused_state = "paused"
@@ -1617,6 +1617,17 @@ async def update_crawl_state(
                     crawl,
                     allowed_from=RUNNING_AND_WAITING_STATES,
                 )
+
+                # Add size of paused crawl uploaded WACZ to org so that
+                # the org knows it's over quota
+                # TODO: Make sure we don't double-count the storage if the
+                # crawl is resumed or stopped and completes
+                # TODO: Make sure we remove this size if crawl is canceled
+                # if paused_state == "paused_storage_quota_reached":
+                #     await self.org_ops.inc_org_bytes_stored(
+                #         crawl.oid, stats.size, "crawl"
+                #     )
+
             return status

         # if at least one is done according to redis, consider crawl successful

From 441a0963e0b3de8e621c764bc44aa1d4552c26e6 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 11 Nov 2025 17:23:26 -0500
Subject: [PATCH 06/28] Fix datetime deprecation in nightly test fixture

---
 backend/test_nightly/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py
index d984d4642c..d27f73cc86 100644
--- a/backend/test_nightly/conftest.py
+++ b/backend/test_nightly/conftest.py
@@ -1,7 +1,7 @@
 import pytest
 import requests
 import time
-import datetime
+from datetime import datetime, timezone

 HOST_PREFIX = "http://127.0.0.1:30870"

@@ -313,7 +313,7 @@ def error_crawl_id(admin_auth_headers, default_org_id):

 @pytest.fixture(scope="session")
 def org_with_quotas(admin_auth_headers):
-    name = "Quota Org " + datetime.datetime.utcnow().isoformat()
+    name = "Quota Org " + datetime.now(timezone.utc).isoformat()
     r = requests.post(
         f"{API_PREFIX}/orgs/create", headers=admin_auth_headers, json={"name": name}
     )

From c440a7cba78efb30288b0529b3fd4fa2a744ccfb Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 11 Nov 2025 18:19:59 -0500
Subject: [PATCH 07/28] WIP: Mark current issues with some TODOs

---
 backend/btrixcloud/operator/crawls.py | 45 +++++++++++++++++++++------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 55bcb342e4..3bc3a217fc 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -1421,9 +1421,19 @@ async def is_crawl_stopping(
     ) -> Optional[StopReason]:
         """check if crawl is stopping and set reason"""
         # if user requested stop, then enter stopping phase
+        print(
+            f"Debugging is_crawl_stopping - status.stopReason: {status.stopReason}, paused_at: {crawl.paused_at}",
+            flush=True,
+        )
+
         if crawl.stopping:
             return "stopped_by_user"

+        # TODO: This is source of bug where status.stopReason is sometimes None while
+        # crawl.paused_at is (still?)
set after pausing due to storage quota (maybe + # execution time quota too?), which results in state being incorrectly set + # to "paused" + # We still need something like this for manual pauses, however if crawl.paused_at and status.stopReason not in PAUSED_STATES: return "paused" @@ -1610,6 +1620,11 @@ async def update_crawl_state( else: paused_state = "paused" + print( + f"status.stopReason: {status.stopReason}, paused_state: {paused_state}", + flush=True, + ) + await redis.delete(f"{crawl.id}:paused") await self.set_state( paused_state, @@ -1618,15 +1633,18 @@ async def update_crawl_state( allowed_from=RUNNING_AND_WAITING_STATES, ) - # Add size of paused crawl uploaded WACZ to org so that - # the org knows it's over quota - # TODO: Make sure we don't double-count the storage if the - # crawl is resumed or stopped and completes - # TODO: Make sure we remove this size if crawl is canceled - # if paused_state == "paused_storage_quota_reached": - # await self.org_ops.inc_org_bytes_stored( - # crawl.oid, stats.size, "crawl" - # ) + # Add size of uploaded WACZ from paused crawl to org so that + # the org knows it's over its storage quota + # TODO: This is reached several times, so make it idempotent + # TODO: Should this be status.filesAddedSize or stats.size? + if paused_state == "paused_storage_quota_reached": + print( + f"Crawl paused for storage quota, adding size to org. status.filesAddedSize: {status.filesAddedSize}, stats.size: {stats.size}", + flush=True, + ) + # await self.org_ops.inc_org_bytes_stored( + # crawl.oid, status.filesAddedSize, "crawl" + # ) return status @@ -1712,6 +1730,13 @@ async def mark_finished( if state in SUCCESSFUL_STATES: await self.inc_crawl_complete_stats(crawl, finished) + else: + # TODO: Remove any already uploaded WACZ files (e.g. 
from + # paused crawls) from org storage count + # await self.org_ops.inc_org_bytes_stored( + # crawl.oid, -status.filesAddedSize, "crawl" + # ) + pass # Regular Crawl Finished if not crawl.is_qa: @@ -1738,6 +1763,8 @@ async def do_crawl_finished_tasks( if state in SUCCESSFUL_STATES and crawl.oid: await self.page_ops.set_archived_item_page_counts(crawl.id) + # TODO: Make sure WACZs from paused crawls that have already been + # added here aren't double-counted await self.org_ops.inc_org_bytes_stored( crawl.oid, status.filesAddedSize, "crawl" ) From 94daea72652ed5440241f83cd09a73f3901a790f Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 12 Nov 2025 12:00:19 -0500 Subject: [PATCH 08/28] WIP: Add debug logging to beginning of sync_crawls --- backend/btrixcloud/operator/crawls.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 3bc3a217fc..591d9ff290 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -200,6 +200,11 @@ async def sync_crawls(self, data: MCSyncData): seed_file_url=spec.get("seedFileUrl", ""), ) + print( + f"sync_crawls starting - crawl id: {crawl_id}, paused_at: {crawl.paused_at}, stopReason: {status.stopReason}", + flush=True, + ) + # if finalizing, crawl is being deleted if data.finalizing: if not status.finished: @@ -1421,11 +1426,6 @@ async def is_crawl_stopping( ) -> Optional[StopReason]: """check if crawl is stopping and set reason""" # if user requested stop, then enter stopping phase - print( - f"Debugging is_crawl_stopping - status.stopReason: {status.stopReason}, paused_at: {crawl.paused_at}", - flush=True, - ) - if crawl.stopping: return "stopped_by_user" @@ -1609,7 +1609,7 @@ async def update_crawl_state( num_paused = status_count.get("interrupted", 0) if (num_paused + num_failed) >= status.scale: # now fully paused! 
- # remove pausing key and set state to paused + # remove pausing key and set state to appropriate paused state paused_state: TYPE_PAUSED_STATES if status.stopReason == "paused_storage_quota_reached": paused_state = "paused_storage_quota_reached" @@ -1620,11 +1620,6 @@ async def update_crawl_state( else: paused_state = "paused" - print( - f"status.stopReason: {status.stopReason}, paused_state: {paused_state}", - flush=True, - ) - await redis.delete(f"{crawl.id}:paused") await self.set_state( paused_state, From a3217e926b95e678d7ee937e5c34d13b88b6fa43 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 12 Nov 2025 12:37:19 -0500 Subject: [PATCH 09/28] Modify execution time test to account for pausing --- backend/test_nightly/test_execution_minutes_quota.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/backend/test_nightly/test_execution_minutes_quota.py b/backend/test_nightly/test_execution_minutes_quota.py index 3640438f60..d84ca09d04 100644 --- a/backend/test_nightly/test_execution_minutes_quota.py +++ b/backend/test_nightly/test_execution_minutes_quota.py @@ -16,8 +16,6 @@ EXTRA_MINS_QUOTA = 5 EXTRA_SECS_QUOTA = EXTRA_MINS_QUOTA * 60 -config_id = None - def test_set_execution_mins_quota(org_with_quotas, admin_auth_headers): r = requests.post( @@ -31,7 +29,6 @@ def test_set_execution_mins_quota(org_with_quotas, admin_auth_headers): def test_crawl_paused_when_quota_reached(org_with_quotas, admin_auth_headers): # Run crawl - global config_id crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) time.sleep(1) @@ -112,12 +109,8 @@ def test_crawl_paused_when_quota_reached_with_extra( org_with_quotas, admin_auth_headers ): # Run crawl - r = requests.post( - f"{API_PREFIX}/orgs/{org_with_quotas}/crawlconfigs/{config_id}/run", - headers=admin_auth_headers, - ) - assert r.status_code == 200 - crawl_id = r.json()["started"] + crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) + time.sleep(1) while get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) in ( "starting", From aeed3780a8c5fcdb0dfba2c612c0d8c4f5de00f8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 12 Nov 2025 14:20:06 -0500 Subject: [PATCH 10/28] WIP: Add email notification --- backend/btrixcloud/crawls.py | 35 ++++- backend/btrixcloud/emailsender.py | 29 +++++ backend/btrixcloud/models.py | 10 +- backend/btrixcloud/operator/crawls.py | 8 ++ emails/emails/crawl-auto-paused.tsx | 181 ++++++++++++++++++++++++++ emails/emails/index.ts | 1 + 6 files changed, 260 insertions(+), 4 deletions(-) create mode 100644 emails/emails/crawl-auto-paused.tsx diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 7c2200d83c..bdc1dacc12 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -9,6 +9,7 @@ import urllib.parse from datetime import datetime from uuid import UUID +import asyncio from typing import ( Annotated, @@ -79,6 +80,8 @@ MatchCrawlQueueResponse, CrawlLogLine, TagsResponse, + TYPE_AUTO_PAUSED_STATES, + UserRole, ) @@ -93,7 +96,12 @@ class CrawlOps(BaseCrawlOps): crawl_manager: CrawlManager - def __init__(self, crawl_manager: CrawlManager, log_ops: CrawlLogOps, *args): + def __init__( + self, + crawl_manager: CrawlManager, + log_ops: CrawlLogOps, + *args, + ): super().__init__(*args) self.crawl_manager = crawl_manager self.log_ops = log_ops @@ -1197,6 +1205,25 @@ async def get_crawl_logs( qa_run_id=qa_run_id, ) + async def notify_org_admins_of_auto_paused_crawl( + self, + paused_reason: 
TYPE_AUTO_PAUSED_STATES, + cid: UUID, + org: Organization, + ): + """Send email to all org admins about automatically paused crawl""" + users = await self.orgs.get_users_for_org(org, UserRole.OWNER) + workflow = await self.crawl_configs.get_crawl_config(cid) + + await asyncio.gather( + *[ + self.user_manager.email.send_crawl_auto_paused( + user.email, paused_reason, workflow.lastCrawlPausedExpiry, cid, org + ) + for user in users + ] + ) + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): @@ -1219,7 +1246,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, too-many-statements def init_crawls_api( - crawl_manager: CrawlManager, crawl_log_ops: CrawlLogOps, app, user_dep, *args + crawl_manager: CrawlManager, + crawl_log_ops: CrawlLogOps, + app, + user_dep, + *args, ): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, duplicate-code diff --git a/backend/btrixcloud/emailsender.py b/backend/btrixcloud/emailsender.py index aa148daf92..5b285e24ea 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -20,6 +20,7 @@ Organization, InvitePending, Subscription, + TYPE_AUTO_PAUSED_STATES, ) from .utils import is_bool, get_origin @@ -250,3 +251,31 @@ async def send_subscription_trial_ending_soon( behavior_on_trial_end=behavior_on_trial_end, support_email=self.support_email, ) + + async def send_crawl_auto_paused( + self, + user_name: str, + receiver_email: str, + paused_reason: TYPE_AUTO_PAUSED_STATES, + paused_expiry: datetime, + cid: UUID, + org: Organization, + headers=None, + ): + """Send email indicating crawl was paused due to quota or disabled crawling""" + + origin = get_origin(headers) + org_url = f"{origin}/orgs/{org.slug}" + workflow_url = f"{org_url}/workflows/{cid}/latest" + + await self._send_encrypted( + receiver_email, + "crawlAutoPaused", + org_name=org.name, + user_name=user_name, + paused_reason=paused_reason, + paused_expiry=paused_expiry.isoformat(), + org_url=org_url, + workflow_url=workflow_url, + support_email=self.support_email, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 8d1683a713..8d8e2910b2 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -237,12 +237,18 @@ class UserOrgInfoOut(BaseModel): ] RUNNING_STATES = get_args(TYPE_RUNNING_STATES) -TYPE_PAUSED_STATES = Literal[ - "paused", +TYPE_MANUALLY_PAUSED_STATES = Literal["paused"] + +TYPE_AUTO_PAUSED_STATES = Literal[ "paused_storage_quota_reached", "paused_time_quota_reached", "paused_org_readonly", ] + +TYPE_PAUSED_STATES = Literal[ + TYPE_MANUALLY_PAUSED_STATES, + TYPE_AUTO_PAUSED_STATES, +] PAUSED_STATES = get_args(TYPE_PAUSED_STATES) TYPE_WAITING_NOT_PAUSED_STATES = Literal[ diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 591d9ff290..3c44d32e05 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1628,6 +1628,14 @@ async def update_crawl_state( allowed_from=RUNNING_AND_WAITING_STATES, ) + # TODO: This is reached several times, so make it idempotent + if paused_state != "paused": + await self.crawl_ops.notify_org_admins_of_auto_paused_crawl( + paused_reason=paused_state, + cid=crawl.cid, + org=crawl.org, + ) + # Add size of uploaded 
WACZ from paused crawl to org so that # the org knows it's over its storage quota # TODO: This is reached several times, so make it idempotent diff --git a/emails/emails/crawl-auto-paused.tsx b/emails/emails/crawl-auto-paused.tsx new file mode 100644 index 0000000000..a770bebe8e --- /dev/null +++ b/emails/emails/crawl-auto-paused.tsx @@ -0,0 +1,181 @@ +import { Link, Text } from "@react-email/components"; + +import { Template } from "../templates/btrix.js"; +import { + differenceInDays, + formatDate, + formatRelativeDate, + formatRelativeDateToParts, + offsetDays, +} from "../lib/date.js"; +import { Warning } from "../components/warning.js"; + +import { z } from "zod"; +import { trimTrailingSlash } from "../lib/url.js"; + + +export const schema = z.object({ + user_name: z.string(), + org_name: z.string(), + paused_reason: z.enum(["paused_storage_quota_reached", "paused_time_quota_reached", "paused_org_readonly"]), + paused_expiry: z.coerce.date(), + workflow_url: z.url().transform(trimTrailingSlash), + org_url: z.url().transform(trimTrailingSlash), + support_email: z.email().optional(), +}); + +export type CrawlAutoPausedEmailProps = z.infer; + +export const CrawlAutoPausedEmail = ({ + user_name, + org_name, + paused_reason, + paused_expiry, + workflow_url, + org_url, + support_email +}: CrawlAutoPausedEmailProps) => { + const date = formatDate(paused_expiry); + const daysLeft = differenceInDays(new Date(paused_expiry)); + const relative = formatRelativeDate(daysLeft, "days"); + const relativeParts = formatRelativeDateToParts(daysLeft, "days"); + return ( + + ); +}; + +CrawlAutoPausedEmail.PreviewProps = { + user_name: "Tessa", + org_name: "Tessa’s Archives", + paused_reason: "paused_storage_quota_reached", + paused_expiry: offsetDays(7), + workflow_url: "https://dev.browsertrix.com/orgs/default-org/workflows/d4a6cb18-eb54-4d25-a9e8-bb10a3eefa31/latest", + org_url: "https://dev.browsertrix.com/orgs/default-org", + support_email: "support@webrecorder.net", +} satisfies CrawlAutoPausedEmailProps; + +export default CrawlAutoPausedEmail; + +export const subject = () => "Your Browsertrix crawl was automatically paused"; diff --git a/emails/emails/index.ts b/emails/emails/index.ts index 10c71c0f87..cd619ac15a 100644 --- a/emails/emails/index.ts +++ b/emails/emails/index.ts @@ -4,3 +4,4 @@ export * as subscriptionCancel from "./subscription-cancel.js"; export * as trialEndingSoon from "./trial-ending-soon.js"; export * as verifyEmail from "./verify-email.js"; export * as failedBgJob from "./failed-bg-job.js"; +export * as crawlAutoPaused from "./crawl-auto-paused.js"; From d12a296fbad91a68dc824700d5b4dcb853f34ac5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 12 Nov 2025 16:03:31 -0500 Subject: [PATCH 11/28] Inc org bytes stored when crawl files are added, not at end of crawl --- backend/btrixcloud/operator/crawls.py | 36 ++++++--------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 3c44d32e05..a1b328b972 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1407,6 +1407,8 @@ async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): crawl.db_crawl_id, crawl.is_qa, crawl_file, filecomplete.size ) + await self.org_ops.inc_org_bytes_stored(crawl.oid, filecomplete.size, "crawl") + # no replicas for QA for now if crawl.is_qa: return True @@ -1429,11 +1431,12 @@ async def is_crawl_stopping( if crawl.stopping: return 
"stopped_by_user" - # TODO: This is source of bug where status.stopReason is sometimes None while - # crawl.paused_at is (still?) set after pausing due to storage quota (maybe - # execution time quota too?), which results in state being incorrectly set - # to "paused" + # TODO: This is the source of a bug where status.stopReason is sometimes None + # while crawl.paused_at is set after pausing due to quotas being reached, + # which results in state being incorrectly set to "paused" # We still need something like this for manual pauses, however + # The real question is: why is status.stopReason sometimes None at the start + # the crawl sync in the first place?? if crawl.paused_at and status.stopReason not in PAUSED_STATES: return "paused" @@ -1636,19 +1639,6 @@ async def update_crawl_state( org=crawl.org, ) - # Add size of uploaded WACZ from paused crawl to org so that - # the org knows it's over its storage quota - # TODO: This is reached several times, so make it idempotent - # TODO: Should this be status.filesAddedSize or stats.size? - if paused_state == "paused_storage_quota_reached": - print( - f"Crawl paused for storage quota, adding size to org. status.filesAddedSize: {status.filesAddedSize}, stats.size: {stats.size}", - flush=True, - ) - # await self.org_ops.inc_org_bytes_stored( - # crawl.oid, status.filesAddedSize, "crawl" - # ) - return status # if at least one is done according to redis, consider crawl successful @@ -1733,13 +1723,6 @@ async def mark_finished( if state in SUCCESSFUL_STATES: await self.inc_crawl_complete_stats(crawl, finished) - else: - # TODO: Remove any already uploaded WACZ files (e.g. from - # paused crawls) from org storage count - # await self.org_ops.inc_org_bytes_stored( - # crawl.oid, -status.filesAddedSize, "crawl" - # ) - pass # Regular Crawl Finished if not crawl.is_qa: @@ -1766,11 +1749,6 @@ async def do_crawl_finished_tasks( if state in SUCCESSFUL_STATES and crawl.oid: await self.page_ops.set_archived_item_page_counts(crawl.id) - # TODO: Make sure WACZs from paused crawls that have already been - # added here aren't double-counted - await self.org_ops.inc_org_bytes_stored( - crawl.oid, status.filesAddedSize, "crawl" - ) await self.org_ops.set_last_crawl_finished(crawl.oid) await self.coll_ops.add_successful_crawl_to_collections( crawl.id, crawl.cid, crawl.oid From da75c09e54532e1e2878129d55e535b9cf92323d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 12 Nov 2025 17:03:49 -0500 Subject: [PATCH 12/28] More incremental storage work --- backend/btrixcloud/basecrawls.py | 3 ++- backend/btrixcloud/operator/crawls.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 75f47a9d74..9eb5fcea47 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -430,7 +430,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID): """Delete crawl files for failed crawl""" crawl = await self.get_base_crawl(crawl_id) org = await self.orgs.get_org_by_id(oid) - await self._delete_crawl_files(crawl, org) + deleted_size = await self._delete_crawl_files(crawl, org) await self.crawls.find_one_and_update( {"_id": crawl_id, "oid": oid}, { @@ -441,6 +441,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID): } }, ) + return deleted_size async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization): """Delete files for all qa runs in a crawl""" diff --git a/backend/btrixcloud/operator/crawls.py 
b/backend/btrixcloud/operator/crawls.py index a1b328b972..48580b0ee5 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1467,6 +1467,8 @@ async def is_crawl_stopping( active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( crawl.oid ) + # TODO: Make sure this doesn't double-count paused crawl WACZs + # that have already been added to org storage if self.org_ops.storage_quota_reached(org, active_crawls_total_size): await self.pause_crawl(crawl, org) return "paused_storage_quota_reached" @@ -1764,7 +1766,14 @@ async def do_crawl_finished_tasks( ) if state in FAILED_STATES: - await self.crawl_ops.delete_failed_crawl_files(crawl.id, crawl.oid) + deleted_file_size = await self.crawl_ops.delete_failed_crawl_files( + crawl.id, crawl.oid + ) + # Ensure we decrement org storage for any files that were already stored + # (e.g. when crawl was paused) + await self.org_ops.inc_org_bytes_stored( + crawl.oid, -deleted_file_size, "crawl" + ) await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid) await self.event_webhook_ops.create_crawl_finished_notification( From b609ccd82021e0a2055418f97f27b77b56945036 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 12 Nov 2025 17:13:40 -0500 Subject: [PATCH 13/28] One more TODO --- backend/btrixcloud/operator/crawls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 48580b0ee5..ca6e209a7b 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1529,6 +1529,8 @@ async def update_crawl_state( stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) # need to add size of previously completed WACZ files as well! + # TODO: This seems to be making the crawls seem larger than they + # are in the frontend - need to untangle that stats.size += status.filesAddedSize # update status From 7e93e93c68db9079d765f0c932a0080ebdb5e911 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 13 Nov 2025 10:16:01 -0500 Subject: [PATCH 14/28] Move paused with no stop reason condition below quota checks --- backend/btrixcloud/operator/crawls.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index ca6e209a7b..5f77ec4956 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1431,15 +1431,6 @@ async def is_crawl_stopping( if crawl.stopping: return "stopped_by_user" - # TODO: This is the source of a bug where status.stopReason is sometimes None - # while crawl.paused_at is set after pausing due to quotas being reached, - # which results in state being incorrectly set to "paused" - # We still need something like this for manual pauses, however - # The real question is: why is status.stopReason sometimes None at the start - # the crawl sync in the first place?? 
- if crawl.paused_at and status.stopReason not in PAUSED_STATES: - return "paused" - # check timeout if timeout time exceeds elapsed time if crawl.timeout: elapsed = status.elapsedCrawlTime @@ -1478,6 +1469,9 @@ async def is_crawl_stopping( await self.pause_crawl(crawl, org) return "paused_time_quota_reached" + if crawl.paused_at and status.stopReason not in PAUSED_STATES: + return "paused" + return None async def pause_crawl(self, crawl: CrawlSpec, org: Organization): From b986d3cde66087fbc5bccd632b79f8aeac03cd5c Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 13 Nov 2025 14:09:31 -0500 Subject: [PATCH 15/28] WIP: Don't double-count already-uploaded WACZs in check Needs to be tested, just pushing as-is so that I can pick it up next week. There's an issue in local testing where crawls sometimes appear to be twice as big as they really are, which is making Browsertrix think the storage quota is reached prematurely. I haven't yet pinned down the cause of this and it seems intermittent. --- backend/btrixcloud/crawls.py | 14 +++++++++++++ backend/btrixcloud/operator/crawls.py | 29 ++++++++++++++++++--------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index bdc1dacc12..b962cd19d2 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -379,6 +379,20 @@ async def get_active_crawls_size(self, oid: UUID) -> int: return results[0].get("totalSum") or 0 + async def get_active_crawls_uploaded_wacz_size(self, oid: UUID) -> int: + """get size of all waczs already uploaded for (e.g. previously or currently paused) crawls""" + cursor = self.crawls.aggregate( + [ + {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}}, + {"$group": {"_id": None, "totalSum": {"$sum": "$fileSize"}}}, + ] + ) + results = await cursor.to_list(length=1) + if not results: + return 0 + + return results[0].get("totalSum") or 0 + async def delete_crawls( self, org: Organization, diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 5f77ec4956..0bdbb36c7e 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -200,11 +200,6 @@ async def sync_crawls(self, data: MCSyncData): seed_file_url=spec.get("seedFileUrl", ""), ) - print( - f"sync_crawls starting - crawl id: {crawl_id}, paused_at: {crawl.paused_at}, stopReason: {status.stopReason}", - flush=True, - ) - # if finalizing, crawl is being deleted if data.finalizing: if not status.finished: @@ -1455,12 +1450,28 @@ async def is_crawl_stopping( # pause crawl if storage quota is reached if org.quotas.storageQuota: + # Make sure to account for already-uploaded WACZs from active crawls + # that are or previously were paused, which are already accounted for + # in the org storage stats active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( crawl.oid ) - # TODO: Make sure this doesn't double-count paused crawl WACZs - # that have already been added to org storage - if self.org_ops.storage_quota_reached(org, active_crawls_total_size): + print(f"Active crawls total size: {active_crawls_total_size}", flush=True) + already_uploaded_size = ( + await self.crawl_ops.get_active_crawls_uploaded_wacz_size(crawl.oid) + ) + print( + f"Active crawls already uploaded size: {already_uploaded_size}", + flush=True, + ) + active_crawls_not_uploaded_size = ( + active_crawls_total_size - already_uploaded_size + ) + print( + f"Active crawls not yet uploaded size: 
{active_crawls_not_uploaded_size}", + flush=True, + ) + if self.org_ops.storage_quota_reached(org, active_crawls_not_uploaded_size): await self.pause_crawl(crawl, org) return "paused_storage_quota_reached" @@ -1523,8 +1534,6 @@ async def update_crawl_state( stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) # need to add size of previously completed WACZ files as well! - # TODO: This seems to be making the crawls seem larger than they - # are in the frontend - need to untangle that stats.size += status.filesAddedSize # update status From 4bdb2e9b4801d7b1e343defcbb0fa12e0acdb1a0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 17 Nov 2025 22:15:34 -0500 Subject: [PATCH 16/28] Decrement org in delete_failed_crawl_files --- backend/btrixcloud/basecrawls.py | 4 ++-- backend/btrixcloud/operator/crawls.py | 9 +-------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 9eb5fcea47..398ce16daa 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -430,7 +430,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID): """Delete crawl files for failed crawl""" crawl = await self.get_base_crawl(crawl_id) org = await self.orgs.get_org_by_id(oid) - deleted_size = await self._delete_crawl_files(crawl, org) + deleted_file_size = await self._delete_crawl_files(crawl, org) await self.crawls.find_one_and_update( {"_id": crawl_id, "oid": oid}, { @@ -441,7 +441,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID): } }, ) - return deleted_size + await self.orgs.inc_org_bytes_stored(oid, -deleted_file_size, "crawl") async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization): """Delete files for all qa runs in a crawl""" diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 0bdbb36c7e..82e557e0c0 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1771,14 +1771,7 @@ async def do_crawl_finished_tasks( ) if state in FAILED_STATES: - deleted_file_size = await self.crawl_ops.delete_failed_crawl_files( - crawl.id, crawl.oid - ) - # Ensure we decrement org storage for any files that were already stored - # (e.g. when crawl was paused) - await self.org_ops.inc_org_bytes_stored( - crawl.oid, -deleted_file_size, "crawl" - ) + await self.crawl_ops.delete_failed_crawl_files(crawl.id, crawl.oid) await self.page_ops.delete_crawl_pages(crawl.id, crawl.oid) await self.event_webhook_ops.create_crawl_finished_notification( From 5089a4951c3bed4f202932d31d05ae344f5d4e5a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 17 Nov 2025 22:28:35 -0500 Subject: [PATCH 17/28] Shorten docstring --- backend/btrixcloud/crawls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index b962cd19d2..eafde0c484 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -380,7 +380,7 @@ async def get_active_crawls_size(self, oid: UUID) -> int: return results[0].get("totalSum") or 0 async def get_active_crawls_uploaded_wacz_size(self, oid: UUID) -> int: - """get size of all waczs already uploaded for (e.g. 
previously or currently paused) crawls""" + """get size of all waczs already uploaded for running/paused crawls""" cursor = self.crawls.aggregate( [ {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}}, From c8ca9b417c29ed1cf7be40a6644a182021c71eda Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 17 Nov 2025 22:55:08 -0500 Subject: [PATCH 18/28] Fix email sending (but still not yet idempotent) --- backend/btrixcloud/crawls.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index eafde0c484..9f48ab0056 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1227,12 +1227,17 @@ async def notify_org_admins_of_auto_paused_crawl( ): """Send email to all org admins about automatically paused crawl""" users = await self.orgs.get_users_for_org(org, UserRole.OWNER) - workflow = await self.crawl_configs.get_crawl_config(cid) + workflow = await self.crawl_configs.get_crawl_config_out(cid, org) await asyncio.gather( *[ self.user_manager.email.send_crawl_auto_paused( - user.email, paused_reason, workflow.lastCrawlPausedExpiry, cid, org + user.name, + user.email, + paused_reason, + workflow.lastCrawlPausedExpiry, + cid, + org, ) for user in users ] From 24669cef57a6191110ff414f3ee045dee5bb5400 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 17 Nov 2025 23:30:27 -0500 Subject: [PATCH 19/28] Only send auto-paused emails once --- backend/btrixcloud/models.py | 1 + backend/btrixcloud/operator/crawls.py | 8 ++++++-- backend/btrixcloud/operator/models.py | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 8d8e2910b2..800d9cfb71 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -244,6 +244,7 @@ class UserOrgInfoOut(BaseModel): "paused_time_quota_reached", "paused_org_readonly", ] +AUTO_PAUSED_STATES = get_args(TYPE_AUTO_PAUSED_STATES) TYPE_PAUSED_STATES = Literal[ TYPE_MANUALLY_PAUSED_STATES, diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 82e557e0c0..e71ee1070e 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -22,6 +22,7 @@ TYPE_RUNNING_STATES, TYPE_ALL_CRAWL_STATES, TYPE_PAUSED_STATES, + AUTO_PAUSED_STATES, RUNNING_STATES, WAITING_STATES, RUNNING_AND_STARTING_ONLY, @@ -1638,13 +1639,16 @@ async def update_crawl_state( allowed_from=RUNNING_AND_WAITING_STATES, ) - # TODO: This is reached several times, so make it idempotent - if paused_state != "paused": + if ( + paused_state in AUTO_PAUSED_STATES + and not status.autoPausedEmailsSent + ): await self.crawl_ops.notify_org_admins_of_auto_paused_crawl( paused_reason=paused_state, cid=crawl.cid, org=crawl.org, ) + status.autoPausedEmailsSent = True return status diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 5abdbc896d..56275bdee1 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -257,3 +257,6 @@ class CrawlStatus(BaseModel): # last state last_state: TYPE_ALL_CRAWL_STATES = Field(default="starting", exclude=True) + + # email sent to org admins because crawl was auto-paused + autoPausedEmailsSent: bool = False From ca61bee846b65c13e3b9a83e5d12fbf50527e66a Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 18 Nov 2025 12:30:13 -0500 Subject: [PATCH 20/28] Add TODO to address already-existing bug that now matters more 
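
The TODO below documents the accounting problem: update_crawl_state adds status.filesAddedSize (WACZs already uploaded) on top of the size the crawler reports in Redis, but the crawler does not necessarily clear its reported size once a WACZ is uploaded, so the same bytes can be counted twice while a crawl is pausing. A rough sketch of the arithmetic, with illustrative values only (these are not the actual operator fields):

    # hypothetical numbers showing the double-count described in the TODO
    redis_size = 1_000_000        # size the crawler still reports after packing a WACZ
    files_added_size = 1_000_000  # size of the WACZ uploaded when the crawl paused
    stats_size = redis_size + files_added_size  # 2_000_000: twice the real size

    # if the crawler cleared its counter after each upload, the sum would be correct
    stats_size = 0 + files_added_size  # 1_000_000

This mattered less when the inflated size only showed up in crawl stats, but the storage quota check now sums stats.size across active crawls, so the double-count can pause crawls prematurely.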
--- backend/btrixcloud/operator/crawls.py | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index e71ee1070e..463946d9b0 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1535,6 +1535,10 @@ async def update_crawl_state( stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) # need to add size of previously completed WACZ files as well! + # TODO: This sometimes results in the crawl's stats.size being + # twice as large as expected when pausing crawls, as stats.size + # is not necessarily decremented once WACZ files are uploaded + # This then can have downstream effects on the storage quota check stats.size += status.filesAddedSize # update status
From 502d1adfb32c370d1c42b3ccb546d40ec4036de0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 24 Nov 2025 15:59:25 -0500 Subject: [PATCH 21/28] TEMP: Add print logging to help figure out bug
--- backend/btrixcloud/operator/crawls.py | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 463946d9b0..9891527e11 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1534,6 +1534,13 @@ async def update_crawl_state( results = await redis.hgetall(f"{crawl.id}:status") stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) + print(f"crawl.paused_at: {crawl.paused_at}", flush=True) + print(f"crawl.stopping: {crawl.stopping}", flush=True) + print(f"status.stopReason: {status.stopReason}", flush=True) + + print(f"stats.size initial: {stats.size}", flush=True) + print(f"status.filesAddedSize: {status.filesAddedSize}", flush=True) + # need to add size of previously completed WACZ files as well! # TODO: This sometimes results in the crawl's stats.size being # twice as large as expected when pausing crawls, as stats.size # is not necessarily decremented once WACZ files are uploaded # This then can have downstream effects on the storage quota check stats.size += status.filesAddedSize + print(f"stats.size after adding filesAddedSize: {stats.size}", flush=True) + # update status status.pagesDone = stats.done status.pagesFound = stats.found
From 2787eca5f1336bc13fd6ae6320b372be6409b522 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 24 Nov 2025 16:45:53 -0500 Subject: [PATCH 22/28] Semi-solution with comments describing why it's not perfect
--- backend/btrixcloud/operator/crawls.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 9891527e11..1292537880 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1539,16 +1539,27 @@ async def update_crawl_state( results = await redis.hgetall(f"{crawl.id}:status") stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) print(f"crawl.paused_at: {crawl.paused_at}", flush=True) print(f"crawl.stopping: {crawl.stopping}", flush=True) print(f"status.stopReason: {status.stopReason}", flush=True) print(f"stats.size initial: {stats.size}", flush=True) + print(f"status.filesAdded: {status.filesAdded}", flush=True) print(f"status.filesAddedSize: {status.filesAddedSize}", flush=True) # need to add size of previously completed WACZ files as well! 
- # TODO: This sometimes results in the crawl's stats.size being - # twice as large as expected when pausing crawls, as stats.size - # is not necessarily decremented once WACZ files are uploaded - # This then can have downstream effects on the storage quota check - stats.size += status.filesAddedSize - - print(f"stats.size after adding filesAddedSize: {stats.size}", flush=True) + # TODO: Fix this so that it works as expected with pausing + # - The if clause here is close to a solution except it still results + # in pauses after the first showing a smaller-than-expected size + # because it no longer counts files added previous to resuming the crawl. + # - Kind of seems like what we need here is either a way of still adding + # files added prior to the current pause without double-adding files + # that are currently being uploaded. + # - Another way to do that might be to have the crawler decrement the size + # of a crawl by the amount of WACZs that are uploaded, so that this here + # in the operator can stay simpler? + if status.stopReason not in PAUSED_STATES: + stats.size += status.filesAddedSize + print(f"stats.size after adding filesAddedSize: {stats.size}", flush=True) + else: + print( + "not adding filesAddedSize to stats.size, crawl is pausing", flush=True + ) # update status status.pagesDone = stats.done status.pagesFound = stats.found
From 1e3df5f5437335f7b39b1a5fafae2570696f60db Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 25 Nov 2025 08:11:39 -0800 Subject: [PATCH 23/28] refactor to add 'pendingSize' to crawl which unambiguously stores the… (#3013) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
… pending, un-uploaded size - use pending size to determine if quota reached - also request pause to be set before assuming paused state - also ensure data is actually committed before shutting down pods (in case of any edge cases) - clear paused flag in redis after crawler pods shutdown - add OpCrawlStats to avoid adding unnecessary profile_update to public API this assumes changes in crawler to support: clearing size after WACZ upload, ensure upload happens if pod starts when crawl is paused --------- Co-authored-by: Tessa Walsh
--- backend/btrixcloud/crawls.py | 26 ++--- backend/btrixcloud/models.py | 5 +- backend/btrixcloud/operator/crawls.py | 137 ++++++++++++-------------- backend/btrixcloud/operator/models.py | 17 +++- 4 files changed, 90 insertions(+), 95 deletions(-)
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 9f48ab0056..78d18d6b1c 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -370,21 +370,7 @@ async def get_active_crawls_size(self, oid: UUID) -> int: cursor = self.crawls.aggregate( [ {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}}, - {"$group": {"_id": None, "totalSum": {"$sum": "$stats.size"}}}, - ] - ) - results = await cursor.to_list(length=1) - if not results: - return 0 - - return results[0].get("totalSum") or 0 - - async def get_active_crawls_uploaded_wacz_size(self, oid: UUID) -> int: - """get size of all waczs already uploaded for running/paused crawls""" - cursor = self.crawls.aggregate( - [ - {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}}, - {"$group": {"_id": None, "totalSum": {"$sum": "$fileSize"}}}, + {"$group": {"_id": None, "totalSum": {"$sum": "$pendingSize"}}}, ] ) results = await cursor.to_list(length=1) if not results: return 0 @@ -669,14 +655,16 @@ async def 
update_crawl_state_if_allowed( return res is not None async def update_running_crawl_stats( - self, crawl_id: str, is_qa: bool, stats: CrawlStats + self, crawl_id: str, is_qa: bool, stats: CrawlStats, pending_size: int ) -> bool: """update running crawl stats""" prefix = "" if not is_qa else "qa." query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"} - res = await self.crawls.find_one_and_update( - query, {"$set": {f"{prefix}stats": stats.dict()}} - ) + update: dict[str, dict | int] = {f"{prefix}stats": stats.dict()} + if not is_qa: + update["pendingSize"] = pending_size + + res = await self.crawls.find_one_and_update(query, {"$set": update}) return res is not None async def inc_crawl_exec_time( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 800d9cfb71..4961c56e0d 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -304,8 +304,6 @@ class CrawlStats(BaseModel): done: int = 0 size: int = 0 - profile_update: Optional[str] = "" - # ============================================================================ @@ -907,6 +905,7 @@ class CrawlOut(BaseMongoModel): fileSize: int = 0 fileCount: int = 0 + pendingSize: int = 0 tags: Optional[List[str]] = [] @@ -1091,6 +1090,8 @@ class Crawl(BaseCrawl, CrawlConfigCore): qa: Optional[QARun] = None qaFinished: Optional[Dict[str, QARun]] = {} + pendingSize: int = 0 + # ============================================================================ class CrawlCompleteIn(BaseModel): diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 1292537880..33cabec192 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -22,7 +22,6 @@ TYPE_RUNNING_STATES, TYPE_ALL_CRAWL_STATES, TYPE_PAUSED_STATES, - AUTO_PAUSED_STATES, RUNNING_STATES, WAITING_STATES, RUNNING_AND_STARTING_ONLY, @@ -30,11 +29,9 @@ SUCCESSFUL_STATES, FAILED_STATES, PAUSED_STATES, - CrawlStats, CrawlFile, CrawlCompleteIn, StorageRef, - Organization, ) from btrixcloud.utils import ( @@ -48,6 +45,7 @@ from .models import ( CrawlSpec, CrawlStatus, + OpCrawlStats, StopReason, MCBaseRequest, MCSyncData, @@ -398,7 +396,13 @@ async def sync_crawls(self, data: MCSyncData): if status.pagesFound < status.desiredScale: status.desiredScale = max(1, status.pagesFound) - is_paused = bool(crawl.paused_at) and status.state in PAUSED_STATES + # paused and shut down pods if size is <= 4096 (empty dir) + # paused_at is set state is a valid paused state + is_paused = ( + bool(crawl.paused_at) + and status.sizePending <= 4096 + and status.state in PAUSED_STATES + ) for i in range(0, status.desiredScale): if status.pagesFound < i * num_browsers_per_pod: @@ -686,7 +690,7 @@ async def set_state( crawl: CrawlSpec, allowed_from: Sequence[TYPE_ALL_CRAWL_STATES], finished: Optional[datetime] = None, - stats: Optional[CrawlStats] = None, + stats: Optional[OpCrawlStats] = None, ): """set status state and update db, if changed if allowed_from passed in, can only transition from allowed_from state, @@ -837,7 +841,7 @@ async def fail_crawl( crawl: CrawlSpec, status: CrawlStatus, pods: dict, - stats: CrawlStats, + stats: OpCrawlStats, redis: Redis, ) -> bool: """Mark crawl as failed, log crawl state and print crawl logs, if possible""" @@ -980,6 +984,10 @@ async def sync_crawl_state( ) if not crawler_running and redis: + # clear paused key now so can resume + if crawl.paused_at: + await redis.delete(f"{crawl.id}:paused") + # if crawler is not running for REDIS_TTL seconds, also 
stop redis # but not right away in case crawler pod is just restarting. # avoids keeping redis pods around while no crawler pods are up @@ -1006,12 +1014,12 @@ async def sync_crawl_state( status.lastActiveTime = date_to_str(dt_now()) file_done = await redis.rpop(self.done_key) + while file_done: msg = json.loads(file_done) # add completed file if msg.get("filename"): await self.add_file_to_crawl(msg, crawl, redis) - await redis.incr("filesAdded") # get next file done file_done = await redis.rpop(self.done_key) @@ -1381,7 +1389,7 @@ def get_log_line(self, message, details): } return json.dumps(err) - async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): + async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis) -> int: """Handle finished CrawlFile to db""" filecomplete = CrawlCompleteIn(**cc_data) @@ -1398,6 +1406,11 @@ async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): ) await redis.incr("filesAddedSize", filecomplete.size) + await redis.incr("filesAdded") + + # sizes = await redis.hkeys(f"{crawl.id}:size") + # for size in sizes: + # await redis.hmset(f"{crawl.id}:size", {size: 0 for size in sizes}) await self.crawl_ops.add_crawl_file( crawl.db_crawl_id, crawl.is_qa, crawl_file, filecomplete.size @@ -1407,7 +1420,7 @@ async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): # no replicas for QA for now if crawl.is_qa: - return True + return filecomplete.size try: await self.background_job_ops.create_replica_jobs( @@ -1417,7 +1430,7 @@ async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): except Exception as exc: print("Replicate Exception", exc, flush=True) - return True + return filecomplete.size async def is_crawl_stopping( self, crawl: CrawlSpec, status: CrawlStatus @@ -1446,8 +1459,7 @@ async def is_crawl_stopping( # pause crawl if org is set read-only if org.readOnly: - await self.pause_crawl(crawl, org) - return "paused_org_readonly" + return self.request_pause_crawl("paused_org_readonly", crawl) # pause crawl if storage quota is reached if org.quotas.storageQuota: @@ -1457,44 +1469,35 @@ async def is_crawl_stopping( active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( crawl.oid ) - print(f"Active crawls total size: {active_crawls_total_size}", flush=True) - already_uploaded_size = ( - await self.crawl_ops.get_active_crawls_uploaded_wacz_size(crawl.oid) - ) - print( - f"Active crawls already uploaded size: {already_uploaded_size}", - flush=True, - ) - active_crawls_not_uploaded_size = ( - active_crawls_total_size - already_uploaded_size - ) - print( - f"Active crawls not yet uploaded size: {active_crawls_not_uploaded_size}", - flush=True, - ) - if self.org_ops.storage_quota_reached(org, active_crawls_not_uploaded_size): - await self.pause_crawl(crawl, org) - return "paused_storage_quota_reached" + + if self.org_ops.storage_quota_reached(org, active_crawls_total_size): + return self.request_pause_crawl("paused_storage_quota_reached", crawl) # pause crawl if execution time quota is reached if self.org_ops.exec_mins_quota_reached(org): - await self.pause_crawl(crawl, org) - return "paused_time_quota_reached" + return self.request_pause_crawl("paused_time_quota_reached", crawl) if crawl.paused_at and status.stopReason not in PAUSED_STATES: return "paused" return None - async def pause_crawl(self, crawl: CrawlSpec, org: Organization): - """Pause crawl and update crawl spec""" - paused_at = dt_now() - await self.crawl_ops.pause_crawl(crawl.id, org, pause=True, paused_at=paused_at) - crawl.paused_at = 
paused_at + def request_pause_crawl( + self, reason: StopReason, crawl: CrawlSpec + ) -> Optional[StopReason]: + """Request crawl to be paused asynchronously, equivalent of user clicking 'pause' button + if crawl is paused, then use the specified reason instead of default paused state + """ + if crawl.paused_at: + return reason + + print(f"request pause for {reason}") + self.run_task(self.crawl_ops.pause_crawl(crawl.id, crawl.org, pause=True)) + return None async def get_redis_crawl_stats( self, redis: Redis, crawl_id: str - ) -> tuple[CrawlStats, dict[str, Any]]: + ) -> tuple[OpCrawlStats, dict[str, Any]]: """get page stats""" try: # crawler >0.9.0, done key is a value @@ -1514,7 +1517,7 @@ async def get_redis_crawl_stats( profile_update = await redis.get(f"{crawl_id}:profileUploaded") - stats = CrawlStats( + stats = OpCrawlStats( found=pages_found, done=pages_done, size=archive_size, @@ -1534,45 +1537,36 @@ async def update_crawl_state( results = await redis.hgetall(f"{crawl.id}:status") stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) - print(f"crawl.paused_at: {crawl.paused_at}", flush=True) - print(f"crawl.stopping: {crawl.stopping}", flush=True) - print(f"status.stopReason: {status.stopReason}", flush=True) + pending_size = stats.size - print(f"stats.size initial: {stats.size}", flush=True) + stats.size += status.filesAddedSize + + total_size = stats.size + + print(f"pending size: {pending_size}", flush=True) print(f"status.filesAdded: {status.filesAdded}", flush=True) print(f"status.filesAddedSize: {status.filesAddedSize}", flush=True) - - # need to add size of previously completed WACZ files as well! - # TODO: Fix this so that it works as expected with pausing - # - The if clause here is close to a solution except it still results - # in pauses after the first showing a smaller-than-expected size - # because it no longer counts files added previous to resuming the crawl. - # - Kind of seems like what we need here is either a way of still adding - # files added prior to the current pause without double-adding files - # that are currently being uploaded. - # - Another way to do that might be to have the crawler decrement the size - # of a crawl by the amount of WACZs that are uploaded, so that this here - # in the operator can stay simpler? 
- if status.stopReason not in PAUSED_STATES: - stats.size += status.filesAddedSize - print(f"stats.size after adding filesAddedSize: {stats.size}", flush=True) - else: - print( - "not adding filesAddedSize to stats.size, crawl is pausing", flush=True - ) + print(f"total: {total_size}", flush=True) + print( + f"org quota: {crawl.org.bytesStored + stats.size} <= {crawl.org.quotas.storageQuota}", + flush=True, + ) # update status status.pagesDone = stats.done status.pagesFound = stats.found - status.size = stats.size + + status.sizePending = pending_size + status.size = total_size status.sizeHuman = humanize.naturalsize(status.size) await self.crawl_ops.update_running_crawl_stats( - crawl.db_crawl_id, crawl.is_qa, stats + crawl.db_crawl_id, crawl.is_qa, stats, pending_size ) for key, value in sizes.items(): increase_storage = False + pod_info = None value = int(value) if value > 0 and status.podStatus: pod_info = status.podStatus[key] @@ -1588,11 +1582,11 @@ async def update_crawl_state( increase_storage = True # out of storage - if pod_info.isNewExit and pod_info.exitCode == 3: + if pod_info and pod_info.isNewExit and pod_info.exitCode == 3: pod_info.used.storage = pod_info.allocated.storage increase_storage = True - if increase_storage: + if pod_info and increase_storage: new_storage = math.ceil( pod_info.used.storage * self.min_avail_storage_ratio / 1_000_000_000 ) @@ -1655,7 +1649,7 @@ async def update_crawl_state( else: paused_state = "paused" - await redis.delete(f"{crawl.id}:paused") + # await redis.delete(f"{crawl.id}:paused") await self.set_state( paused_state, status, @@ -1663,10 +1657,7 @@ async def update_crawl_state( allowed_from=RUNNING_AND_WAITING_STATES, ) - if ( - paused_state in AUTO_PAUSED_STATES - and not status.autoPausedEmailsSent - ): + if paused_state != "paused" and not status.autoPausedEmailsSent: await self.crawl_ops.notify_org_admins_of_auto_paused_crawl( paused_reason=paused_state, cid=crawl.cid, @@ -1731,7 +1722,7 @@ async def mark_finished( crawl: CrawlSpec, status: CrawlStatus, state: TYPE_NON_RUNNING_STATES, - stats: Optional[CrawlStats] = None, + stats: Optional[OpCrawlStats] = None, ) -> bool: """mark crawl as finished, set finished timestamp and final state""" @@ -1775,7 +1766,7 @@ async def do_crawl_finished_tasks( crawl: CrawlSpec, status: CrawlStatus, state: TYPE_NON_RUNNING_STATES, - stats: Optional[CrawlStats], + stats: Optional[OpCrawlStats], ) -> None: """Run tasks after crawl completes in asyncio.task coroutine.""" await self.crawl_config_ops.stats_recompute_last( diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 56275bdee1..60099ee857 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -6,7 +6,12 @@ from typing import Optional, DefaultDict, Literal, Annotated, Any from pydantic import BaseModel, Field from kubernetes.utils import parse_quantity -from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization +from btrixcloud.models import ( + StorageRef, + TYPE_ALL_CRAWL_STATES, + Organization, + CrawlStats, +) BTRIX_API = "btrix.cloud/v1" @@ -203,6 +208,13 @@ def should_restart_pod(self, forced: bool = False) -> Optional[str]: return None +# ============================================================================ +class OpCrawlStats(CrawlStats): + """crawl stats + internal profile update""" + + profile_update: Optional[str] = "" + + # ============================================================================ # pylint: 
disable=invalid-name class CrawlStatus(BaseModel): @@ -215,6 +227,9 @@ class CrawlStatus(BaseModel): # human readable size string sizeHuman: str = "" + # pending size (not uploaded) + sizePending: int = 0 + # actual observed scale (number of pods active) scale: int = 0 # desired scale as computed by crawl state (number of pods that should be active) From fb6428d42da1641792333d3c10e4e016aec81296 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Tue, 25 Nov 2025 11:49:47 -0500 Subject: [PATCH 24/28] Small tweaks --- backend/btrixcloud/crawls.py | 4 ++-- backend/btrixcloud/operator/crawls.py | 15 ++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 78d18d6b1c..89a1cd44fc 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -365,8 +365,8 @@ async def get_active_crawls(self, oid: UUID, limit: int) -> list[str]: res_list = await res.to_list() return [res["_id"] for res in res_list] - async def get_active_crawls_size(self, oid: UUID) -> int: - """get size of all active (running, waiting, paused) crawls""" + async def get_active_crawls_pending_size(self, oid: UUID) -> int: + """get pending size of all active (running, waiting, paused) crawls""" cursor = self.crawls.aggregate( [ {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}}, diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 33cabec192..56253119df 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -396,8 +396,8 @@ async def sync_crawls(self, data: MCSyncData): if status.pagesFound < status.desiredScale: status.desiredScale = max(1, status.pagesFound) - # paused and shut down pods if size is <= 4096 (empty dir) - # paused_at is set state is a valid paused state + # paused and shut down pods if size is <= 4096 (empty dir), + # paused_at is set, and state is a valid paused state is_paused = ( bool(crawl.paused_at) and status.sizePending <= 4096 @@ -1463,14 +1463,11 @@ async def is_crawl_stopping( # pause crawl if storage quota is reached if org.quotas.storageQuota: - # Make sure to account for already-uploaded WACZs from active crawls - # that are or previously were paused, which are already accounted for - # in the org storage stats - active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( - crawl.oid + # include not-yet-uploaded pending data from all active crawls + active_crawls_pending_size = ( + await self.crawl_ops.get_active_crawls_pending_size(crawl.oid) ) - - if self.org_ops.storage_quota_reached(org, active_crawls_total_size): + if self.org_ops.storage_quota_reached(org, active_crawls_pending_size): return self.request_pause_crawl("paused_storage_quota_reached", crawl) # pause crawl if execution time quota is reached From 2cb30946b89c02dc7f13eb644f3f629ba5dec7e5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 26 Nov 2025 12:34:31 -0500 Subject: [PATCH 25/28] Track autoPausedEmailsSent state in db instead of crawl state This is much more reliable, prevents duplicate emails as was sometimes happening before, and makes it easier to clear the state when a crawl is unpaused. 
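
In sketch form, the flow this commit sets up is roughly the following (condensed; notify_once is an illustrative name, and the real code uses notify_org_admins_of_auto_paused_crawl plus the get/set helpers added below, on the same Motor collection CrawlOps already uses):

    # condensed sketch of the idempotent notification flow
    async def notify_once(self, crawl_id: str, org, send_emails) -> None:
        res = await self.crawls.find_one(
            {"_id": crawl_id, "oid": org.id, "type": "crawl"},
            projection=["autoPausedEmailsSent"],
        )
        if res and res.get("autoPausedEmailsSent", False):
            # an earlier operator sync already sent emails for this pause
            return
        await send_emails()
        # persist the flag so re-entrant operator syncs skip the send
        await self.crawls.find_one_and_update(
            {"_id": crawl_id, "oid": org.id, "type": "crawl"},
            {"$set": {"autoPausedEmailsSent": True}},
        )

Unpausing resets the flag (see pause_crawl below), so a crawl that is auto-paused again after resuming will notify admins again.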
--- backend/btrixcloud/crawls.py | 29 +++++++++++++++++++++++++++ backend/btrixcloud/models.py | 2 ++ backend/btrixcloud/operator/crawls.py | 9 +++++++-- backend/btrixcloud/operator/models.py | 3 --- 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 89a1cd44fc..66184afac2 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -838,6 +838,11 @@ async def pause_crawl( if pause and not paused_at: paused_at = dt_now() + if not pause: + # If unpausing, unset autoPausedEmailsSent so that we will send + # emails again if quota is reached + await self.set_auto_paused_emails_sent(crawl_id, org, False) + try: result = await self.crawl_manager.pause_resume_crawl( crawl_id, paused_at=paused_at @@ -1210,6 +1215,7 @@ async def get_crawl_logs( async def notify_org_admins_of_auto_paused_crawl( self, paused_reason: TYPE_AUTO_PAUSED_STATES, + crawl_id: str, cid: UUID, org: Organization, ): @@ -1231,6 +1237,29 @@ async def notify_org_admins_of_auto_paused_crawl( ] ) + await self.set_auto_paused_emails_sent(crawl_id, org) + + async def set_auto_paused_emails_sent( + self, crawl_id: str, org: Organization, emails_sent: bool = True + ): + """Set if auto-paused emails already sent""" + await self.crawls.find_one_and_update( + {"_id": crawl_id, "oid": org.id, "type": "crawl"}, + {"$set": {"autoPausedEmailsSent": emails_sent}}, + ) + + async def get_auto_paused_emails_sent( + self, crawl_id: str, org: Organization + ) -> bool: + """Return whether auto-paused emails already sent for crawl""" + res = await self.crawls.find_one( + {"_id": crawl_id, "oid": org.id, "type": "crawl"}, + projection=["autoPausedEmailsSent"], + ) + if res: + return res.get("autoPausedEmailsSent", False) + return False + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 4961c56e0d..9ab01e317a 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1092,6 +1092,8 @@ class Crawl(BaseCrawl, CrawlConfigCore): pendingSize: int = 0 + autoPausedEmailsSent: bool = False + # ============================================================================ class CrawlCompleteIn(BaseModel): diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 56253119df..400913a300 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1654,13 +1654,18 @@ async def update_crawl_state( allowed_from=RUNNING_AND_WAITING_STATES, ) - if paused_state != "paused" and not status.autoPausedEmailsSent: + if ( + paused_state != "paused" + and not await self.crawl_ops.get_auto_paused_emails_sent( + crawl.id, crawl.org + ) + ): await self.crawl_ops.notify_org_admins_of_auto_paused_crawl( paused_reason=paused_state, + crawl_id=crawl.id, cid=crawl.cid, org=crawl.org, ) - status.autoPausedEmailsSent = True return status diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 60099ee857..446babe196 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -272,6 +272,3 @@ class CrawlStatus(BaseModel): # last state last_state: TYPE_ALL_CRAWL_STATES = Field(default="starting", exclude=True) - - # email sent to org admins because crawl was auto-paused - autoPausedEmailsSent: bool = False From 
2aa8b9c6df6f5d7af1b4dace92b2b946796edf8e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 26 Nov 2025 12:58:16 -0500 Subject: [PATCH 26/28] Add isPaused crawler util function for frontend --- frontend/src/features/archived-items/crawl-status.ts | 12 +++--------- .../workflow-action-menu/workflow-action-menu.ts | 7 ++----- frontend/src/pages/org/workflow-detail.ts | 12 +++++------- frontend/src/utils/crawler.ts | 5 +++++ 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/frontend/src/features/archived-items/crawl-status.ts b/frontend/src/features/archived-items/crawl-status.ts index 0c2bac403f..9c7b70eb2a 100644 --- a/frontend/src/features/archived-items/crawl-status.ts +++ b/frontend/src/features/archived-items/crawl-status.ts @@ -5,11 +5,8 @@ import startCase from "lodash/fp/startCase"; import { TailwindElement } from "@/classes/TailwindElement"; import { labelWithIcon } from "@/layouts/labelWithIcon"; -import { - PAUSED_STATES, - RUNNING_STATES, - type CrawlState, -} from "@/types/crawlState"; +import { RUNNING_STATES, type CrawlState } from "@/types/crawlState"; +import { isPaused } from "@/utils/crawler"; import { animatePulse } from "@/utils/css"; type CrawlType = "crawl" | "qa"; @@ -377,10 +374,7 @@ export class CrawlStatus extends TailwindElement { ) { return "pausing"; } - if ( - !this.shouldPause && - (PAUSED_STATES as readonly string[]).includes(this.state || "") - ) { + if (!this.shouldPause && isPaused(this.state || "")) { return "resuming"; } return this.state; diff --git a/frontend/src/features/crawl-workflows/workflow-action-menu/workflow-action-menu.ts b/frontend/src/features/crawl-workflows/workflow-action-menu/workflow-action-menu.ts index 69e8cbc874..a93b1ddbf9 100644 --- a/frontend/src/features/crawl-workflows/workflow-action-menu/workflow-action-menu.ts +++ b/frontend/src/features/crawl-workflows/workflow-action-menu/workflow-action-menu.ts @@ -10,8 +10,7 @@ import { BtrixElement } from "@/classes/BtrixElement"; import { ClipboardController } from "@/controllers/clipboard"; import { WorkflowTab } from "@/routes"; import type { Crawl, ListWorkflow, Workflow } from "@/types/crawler"; -import { PAUSED_STATES } from "@/types/crawlState"; -import { isNotFailed, isSuccessfullyFinished } from "@/utils/crawler"; +import { isNotFailed, isPaused, isSuccessfullyFinished } from "@/utils/crawler"; import { isArchivingDisabled } from "@/utils/orgs"; @customElement("btrix-workflow-action-menu") @@ -42,9 +41,7 @@ export class WorkflowActionMenu extends BtrixElement { const canCrawl = this.appState.isCrawler; const archivingDisabled = isArchivingDisabled(this.org, true); - const paused = (PAUSED_STATES as readonly string[]).includes( - workflow.lastCrawlState || "", - ); + const paused = isPaused(workflow.lastCrawlState || ""); const crawling = workflow.isCrawlRunning && !workflow.lastCrawlStopping && diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index 7fb7ccbadd..4b8a7673dc 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -32,13 +32,14 @@ import { pageNav, type Breadcrumb } from "@/layouts/pageHeader"; import { WorkflowTab } from "@/routes"; import { deleteConfirmation, noData, notApplicable } from "@/strings/ui"; import type { APIPaginatedList, APIPaginationQuery } from "@/types/api"; -import { PAUSED_STATES, type CrawlState } from "@/types/crawlState"; +import { type CrawlState } from "@/types/crawlState"; import { type StorageSeedFile } from 
"@/types/workflow"; import { isApiError } from "@/utils/api"; import { settingsForDuplicate } from "@/utils/crawl-workflows/settingsForDuplicate"; import { DEFAULT_MAX_SCALE, isActive, + isPaused, isSkipped, isSuccessfullyFinished, renderName, @@ -367,9 +368,7 @@ export class WorkflowDetail extends BtrixElement { } private get isPaused() { - return (PAUSED_STATES as readonly string[]).includes( - this.workflow?.lastCrawlState || "", - ); + return isPaused(this.workflow?.lastCrawlState || ""); } private get isResuming() { @@ -920,9 +919,8 @@ export class WorkflowDetail extends BtrixElement { return ( this.workflow.lastCrawlShouldPause === - !(PAUSED_STATES as readonly string[]).includes( - this.workflow.lastCrawlState || "", - ) || isLoading(this.pauseResumeTask) + !isPaused(this.workflow.lastCrawlState || "") || + isLoading(this.pauseResumeTask) ); } diff --git a/frontend/src/utils/crawler.ts b/frontend/src/utils/crawler.ts index a0cfa35eca..76be91de31 100644 --- a/frontend/src/utils/crawler.ts +++ b/frontend/src/utils/crawler.ts @@ -5,6 +5,7 @@ import { html, type TemplateResult } from "lit"; import type { ArchivedItem, Crawl, Upload, Workflow } from "@/types/crawler"; import { FAILED_STATES, + PAUSED_STATES, RUNNING_AND_WAITING_STATES, SUCCESSFUL_AND_FAILED_STATES, SUCCESSFUL_STATES, @@ -51,6 +52,10 @@ export function isNotFailed({ state }: { state: string | null }) { ); } +export function isPaused(state: string | null) { + return state && (PAUSED_STATES as readonly string[]).includes(state); +} + export function isPageScopeType( scope?: (typeof WorkflowScopeType)[keyof typeof WorkflowScopeType], ) { From 6aaf74c0edbec65e0f592662955aa017a172455d Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 26 Nov 2025 14:44:27 -0500 Subject: [PATCH 27/28] Check if email already sent from notify method --- backend/btrixcloud/crawls.py | 3 +++ backend/btrixcloud/operator/crawls.py | 7 +------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 66184afac2..eb273b5c7e 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -1220,6 +1220,9 @@ async def notify_org_admins_of_auto_paused_crawl( org: Organization, ): """Send email to all org admins about automatically paused crawl""" + if await self.get_auto_paused_emails_sent(crawl_id, org): + return + users = await self.orgs.get_users_for_org(org, UserRole.OWNER) workflow = await self.crawl_configs.get_crawl_config_out(cid, org) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 400913a300..01b718034d 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1654,12 +1654,7 @@ async def update_crawl_state( allowed_from=RUNNING_AND_WAITING_STATES, ) - if ( - paused_state != "paused" - and not await self.crawl_ops.get_auto_paused_emails_sent( - crawl.id, crawl.org - ) - ): + if paused_state != "paused": await self.crawl_ops.notify_org_admins_of_auto_paused_crawl( paused_reason=paused_state, crawl_id=crawl.id, From 97dd1483214cea94644a1ac673b4ac5bbece54d4 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 26 Nov 2025 17:25:37 -0500 Subject: [PATCH 28/28] Apply suggestions from code review for email Co-authored-by: Emma Segal-Grossman --- emails/emails/crawl-auto-paused.tsx | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/emails/emails/crawl-auto-paused.tsx b/emails/emails/crawl-auto-paused.tsx index a770bebe8e..58e28cf7ac 100644 --- 
a/emails/emails/crawl-auto-paused.tsx +++ b/emails/emails/crawl-auto-paused.tsx @@ -8,7 +8,6 @@ import { formatRelativeDateToParts, offsetDays, } from "../lib/date.js"; -import { Warning } from "../components/warning.js"; import { z } from "zod"; import { trimTrailingSlash } from "../lib/url.js"; @@ -35,9 +34,7 @@ export const CrawlAutoPausedEmail = ({ org_url, support_email }: CrawlAutoPausedEmailProps) => { - const date = formatDate(paused_expiry); const daysLeft = differenceInDays(new Date(paused_expiry)); - const relative = formatRelativeDate(daysLeft, "days"); const relativeParts = formatRelativeDateToParts(daysLeft, "days"); return (