diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 75f47a9d74..398ce16daa 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -430,7 +430,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID): """Delete crawl files for failed crawl""" crawl = await self.get_base_crawl(crawl_id) org = await self.orgs.get_org_by_id(oid) - await self._delete_crawl_files(crawl, org) + deleted_file_size = await self._delete_crawl_files(crawl, org) await self.crawls.find_one_and_update( {"_id": crawl_id, "oid": oid}, { @@ -441,6 +441,7 @@ async def delete_failed_crawl_files(self, crawl_id: str, oid: UUID): } }, ) + await self.orgs.inc_org_bytes_stored(oid, -deleted_file_size, "crawl") async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization): """Delete files for all qa runs in a crawl""" diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index f1519cec14..eb273b5c7e 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -9,6 +9,7 @@ import urllib.parse from datetime import datetime from uuid import UUID +import asyncio from typing import ( Annotated, @@ -79,6 +80,8 @@ MatchCrawlQueueResponse, CrawlLogLine, TagsResponse, + TYPE_AUTO_PAUSED_STATES, + UserRole, ) @@ -93,7 +96,12 @@ class CrawlOps(BaseCrawlOps): crawl_manager: CrawlManager - def __init__(self, crawl_manager: CrawlManager, log_ops: CrawlLogOps, *args): + def __init__( + self, + crawl_manager: CrawlManager, + log_ops: CrawlLogOps, + *args, + ): super().__init__(*args) self.crawl_manager = crawl_manager self.log_ops = log_ops @@ -357,12 +365,12 @@ async def get_active_crawls(self, oid: UUID, limit: int) -> list[str]: res_list = await res.to_list() return [res["_id"] for res in res_list] - async def get_active_crawls_size(self, oid: UUID) -> int: - """get size of all active (running, waiting, paused) crawls""" + async def get_active_crawls_pending_size(self, oid: UUID) -> 
int: + """get pending size of all active (running, waiting, paused) crawls""" cursor = self.crawls.aggregate( [ {"$match": {"state": {"$in": RUNNING_AND_WAITING_STATES}, "oid": oid}}, - {"$group": {"_id": None, "totalSum": {"$sum": "$stats.size"}}}, + {"$group": {"_id": None, "totalSum": {"$sum": "$pendingSize"}}}, ] ) results = await cursor.to_list(length=1) @@ -647,14 +655,16 @@ async def update_crawl_state_if_allowed( return res is not None async def update_running_crawl_stats( - self, crawl_id: str, is_qa: bool, stats: CrawlStats + self, crawl_id: str, is_qa: bool, stats: CrawlStats, pending_size: int ) -> bool: """update running crawl stats""" prefix = "" if not is_qa else "qa." query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"} - res = await self.crawls.find_one_and_update( - query, {"$set": {f"{prefix}stats": stats.dict()}} - ) + update: dict[str, dict | int] = {f"{prefix}stats": stats.dict()} + if not is_qa: + update["pendingSize"] = pending_size + + res = await self.crawls.find_one_and_update(query, {"$set": update}) return res is not None async def inc_crawl_exec_time( @@ -812,7 +822,11 @@ async def get_crawl_stats( return crawls_data async def pause_crawl( - self, crawl_id: str, org: Organization, pause: bool + self, + crawl_id: str, + org: Organization, + pause: bool, + paused_at: Optional[datetime] = None, ) -> Dict[str, bool]: """pause or resume a crawl temporarily""" crawl = await self.get_base_crawl(crawl_id, org) @@ -821,10 +835,13 @@ async def pause_crawl( result = None - if pause: + if pause and not paused_at: paused_at = dt_now() - else: - paused_at = None + + if not pause: + # If unpausing, unset autoPausedEmailsSent so that we will send + # emails again if quota is reached + await self.set_auto_paused_emails_sent(crawl_id, org, False) try: result = await self.crawl_manager.pause_resume_crawl( @@ -1195,6 +1212,57 @@ async def get_crawl_logs( qa_run_id=qa_run_id, ) + async def notify_org_admins_of_auto_paused_crawl( + self, 
+ paused_reason: TYPE_AUTO_PAUSED_STATES, + crawl_id: str, + cid: UUID, + org: Organization, + ): + """Send email to all org admins about automatically paused crawl""" + if await self.get_auto_paused_emails_sent(crawl_id, org): + return + + users = await self.orgs.get_users_for_org(org, UserRole.OWNER) + workflow = await self.crawl_configs.get_crawl_config_out(cid, org) + + await asyncio.gather( + *[ + self.user_manager.email.send_crawl_auto_paused( + user.name, + user.email, + paused_reason, + workflow.lastCrawlPausedExpiry, + cid, + org, + ) + for user in users + ] + ) + + await self.set_auto_paused_emails_sent(crawl_id, org) + + async def set_auto_paused_emails_sent( + self, crawl_id: str, org: Organization, emails_sent: bool = True + ): + """Set if auto-paused emails already sent""" + await self.crawls.find_one_and_update( + {"_id": crawl_id, "oid": org.id, "type": "crawl"}, + {"$set": {"autoPausedEmailsSent": emails_sent}}, + ) + + async def get_auto_paused_emails_sent( + self, crawl_id: str, org: Organization + ) -> bool: + """Return whether auto-paused emails already sent for crawl""" + res = await self.crawls.find_one( + {"_id": crawl_id, "oid": org.id, "type": "crawl"}, + projection=["autoPausedEmailsSent"], + ) + if res: + return res.get("autoPausedEmailsSent", False) + return False + # ============================================================================ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): @@ -1217,7 +1285,11 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id: str): # ============================================================================ # pylint: disable=too-many-arguments, too-many-locals, too-many-statements def init_crawls_api( - crawl_manager: CrawlManager, crawl_log_ops: CrawlLogOps, app, user_dep, *args + crawl_manager: CrawlManager, + crawl_log_ops: CrawlLogOps, + app, + user_dep, + *args, ): """API for crawl management, including crawl done callback""" # pylint: disable=invalid-name, 
duplicate-code diff --git a/backend/btrixcloud/emailsender.py b/backend/btrixcloud/emailsender.py index aa148daf92..5b285e24ea 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -20,6 +20,7 @@ Organization, InvitePending, Subscription, + TYPE_AUTO_PAUSED_STATES, ) from .utils import is_bool, get_origin @@ -250,3 +251,31 @@ async def send_subscription_trial_ending_soon( behavior_on_trial_end=behavior_on_trial_end, support_email=self.support_email, ) + + async def send_crawl_auto_paused( + self, + user_name: str, + receiver_email: str, + paused_reason: TYPE_AUTO_PAUSED_STATES, + paused_expiry: datetime, + cid: UUID, + org: Organization, + headers=None, + ): + """Send email indicating crawl was paused due to quota or disabled crawling""" + + origin = get_origin(headers) + org_url = f"{origin}/orgs/{org.slug}" + workflow_url = f"{org_url}/workflows/{cid}/latest" + + await self._send_encrypted( + receiver_email, + "crawlAutoPaused", + org_name=org.name, + user_name=user_name, + paused_reason=paused_reason, + paused_expiry=paused_expiry.isoformat(), + org_url=org_url, + workflow_url=workflow_url, + support_email=self.support_email, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e0347c6f29..9ab01e317a 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -237,10 +237,30 @@ class UserOrgInfoOut(BaseModel): ] RUNNING_STATES = get_args(TYPE_RUNNING_STATES) -TYPE_WAITING_STATES = Literal[ - "starting", "waiting_capacity", "waiting_org_limit", "paused" +TYPE_MANUALLY_PAUSED_STATES = Literal["paused"] + +TYPE_AUTO_PAUSED_STATES = Literal[ + "paused_storage_quota_reached", + "paused_time_quota_reached", + "paused_org_readonly", +] +AUTO_PAUSED_STATES = get_args(TYPE_AUTO_PAUSED_STATES) + +TYPE_PAUSED_STATES = Literal[ + TYPE_MANUALLY_PAUSED_STATES, + TYPE_AUTO_PAUSED_STATES, +] +PAUSED_STATES = get_args(TYPE_PAUSED_STATES) + +TYPE_WAITING_NOT_PAUSED_STATES = Literal[ + 
"starting", + "waiting_capacity", + "waiting_org_limit", ] -WAITING_STATES = get_args(TYPE_WAITING_STATES) +WAITING_NOT_PAUSED_STATES = get_args(TYPE_WAITING_NOT_PAUSED_STATES) + +TYPE_WAITING_STATES = Literal[TYPE_PAUSED_STATES, TYPE_WAITING_NOT_PAUSED_STATES] +WAITING_STATES = [*PAUSED_STATES, *WAITING_NOT_PAUSED_STATES] TYPE_FAILED_STATES = Literal[ "canceled", @@ -260,7 +280,7 @@ class UserOrgInfoOut(BaseModel): "stopped_org_readonly", ] SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES) -SUCCESSFUL_AND_PAUSED_STATES = ["paused", *SUCCESSFUL_STATES] +SUCCESSFUL_AND_PAUSED_STATES = [*PAUSED_STATES, *SUCCESSFUL_STATES] TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES] RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES] @@ -284,8 +304,6 @@ class CrawlStats(BaseModel): done: int = 0 size: int = 0 - profile_update: Optional[str] = "" - # ============================================================================ @@ -887,6 +905,7 @@ class CrawlOut(BaseMongoModel): fileSize: int = 0 fileCount: int = 0 + pendingSize: int = 0 tags: Optional[List[str]] = [] @@ -1071,6 +1090,10 @@ class Crawl(BaseCrawl, CrawlConfigCore): qa: Optional[QARun] = None qaFinished: Optional[Dict[str, QARun]] = {} + pendingSize: int = 0 + + autoPausedEmailsSent: bool = False + # ============================================================================ class CrawlCompleteIn(BaseModel): diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 51bf7a02bb..01b718034d 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -21,13 +21,14 @@ TYPE_NON_RUNNING_STATES, TYPE_RUNNING_STATES, TYPE_ALL_CRAWL_STATES, + TYPE_PAUSED_STATES, RUNNING_STATES, WAITING_STATES, RUNNING_AND_STARTING_ONLY, RUNNING_AND_WAITING_STATES, SUCCESSFUL_STATES, FAILED_STATES, - CrawlStats, + PAUSED_STATES, CrawlFile, CrawlCompleteIn, StorageRef, @@ -44,6 +45,7 @@ from .models import ( CrawlSpec, 
CrawlStatus, + OpCrawlStats, StopReason, MCBaseRequest, MCSyncData, @@ -394,7 +396,13 @@ async def sync_crawls(self, data: MCSyncData): if status.pagesFound < status.desiredScale: status.desiredScale = max(1, status.pagesFound) - is_paused = bool(crawl.paused_at) and status.state == "paused" + # paused and shut down pods if size is <= 4096 (empty dir), + # paused_at is set, and state is a valid paused state + is_paused = ( + bool(crawl.paused_at) + and status.sizePending <= 4096 + and status.state in PAUSED_STATES + ) for i in range(0, status.desiredScale): if status.pagesFound < i * num_browsers_per_pod: @@ -682,7 +690,7 @@ async def set_state( crawl: CrawlSpec, allowed_from: Sequence[TYPE_ALL_CRAWL_STATES], finished: Optional[datetime] = None, - stats: Optional[CrawlStats] = None, + stats: Optional[OpCrawlStats] = None, ): """set status state and update db, if changed if allowed_from passed in, can only transition from allowed_from state, @@ -833,7 +841,7 @@ async def fail_crawl( crawl: CrawlSpec, status: CrawlStatus, pods: dict, - stats: CrawlStats, + stats: OpCrawlStats, redis: Redis, ) -> bool: """Mark crawl as failed, log crawl state and print crawl logs, if possible""" @@ -976,6 +984,10 @@ async def sync_crawl_state( ) if not crawler_running and redis: + # clear paused key now so can resume + if crawl.paused_at: + await redis.delete(f"{crawl.id}:paused") + # if crawler is not running for REDIS_TTL seconds, also stop redis # but not right away in case crawler pod is just restarting. 
# avoids keeping redis pods around while no crawler pods are up @@ -1002,12 +1014,12 @@ async def sync_crawl_state( status.lastActiveTime = date_to_str(dt_now()) file_done = await redis.rpop(self.done_key) + while file_done: msg = json.loads(file_done) # add completed file if msg.get("filename"): await self.add_file_to_crawl(msg, crawl, redis) - await redis.incr("filesAdded") # get next file done file_done = await redis.rpop(self.done_key) @@ -1377,7 +1389,7 @@ def get_log_line(self, message, details): } return json.dumps(err) - async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): + async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis) -> int: """Handle finished CrawlFile to db""" filecomplete = CrawlCompleteIn(**cc_data) @@ -1394,14 +1406,21 @@ async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): ) await redis.incr("filesAddedSize", filecomplete.size) + await redis.incr("filesAdded") + + # sizes = await redis.hkeys(f"{crawl.id}:size") + # for size in sizes: + # await redis.hmset(f"{crawl.id}:size", {size: 0 for size in sizes}) await self.crawl_ops.add_crawl_file( crawl.db_crawl_id, crawl.is_qa, crawl_file, filecomplete.size ) + await self.org_ops.inc_org_bytes_stored(crawl.oid, filecomplete.size, "crawl") + # no replicas for QA for now if crawl.is_qa: - return True + return filecomplete.size try: await self.background_job_ops.create_replica_jobs( @@ -1411,7 +1430,7 @@ async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): except Exception as exc: print("Replicate Exception", exc, flush=True) - return True + return filecomplete.size async def is_crawl_stopping( self, crawl: CrawlSpec, status: CrawlStatus @@ -1421,9 +1440,6 @@ async def is_crawl_stopping( if crawl.stopping: return "stopped_by_user" - if crawl.paused_at: - return "paused" - # check timeout if timeout time exceeds elapsed time if crawl.timeout: elapsed = status.elapsedCrawlTime @@ -1438,29 +1454,47 @@ async def is_crawl_stopping( if 
crawl.max_crawl_size and status.size > crawl.max_crawl_size: return "size-limit" - # gracefully stop crawl if current running crawl sizes reach storage quota + # pause crawl if current running crawl sizes reach storage quota org = crawl.org + # pause crawl if org is set read-only if org.readOnly: - return "stopped_org_readonly" + return self.request_pause_crawl("paused_org_readonly", crawl) + # pause crawl if storage quota is reached if org.quotas.storageQuota: - active_crawls_total_size = await self.crawl_ops.get_active_crawls_size( - crawl.oid + # include not-yet-uploaded pending data from all active crawls + active_crawls_pending_size = ( + await self.crawl_ops.get_active_crawls_pending_size(crawl.oid) ) + if self.org_ops.storage_quota_reached(org, active_crawls_pending_size): + return self.request_pause_crawl("paused_storage_quota_reached", crawl) - if self.org_ops.storage_quota_reached(org, active_crawls_total_size): - return "stopped_storage_quota_reached" - - # gracefully stop crawl is execution time quota is reached + # pause crawl if execution time quota is reached if self.org_ops.exec_mins_quota_reached(org): - return "stopped_time_quota_reached" + return self.request_pause_crawl("paused_time_quota_reached", crawl) + + if crawl.paused_at and status.stopReason not in PAUSED_STATES: + return "paused" + + return None + + def request_pause_crawl( + self, reason: StopReason, crawl: CrawlSpec + ) -> Optional[StopReason]: + """Request crawl to be paused asynchronously, equivalent of user clicking 'pause' button + if crawl is paused, then use the specified reason instead of default paused state + """ + if crawl.paused_at: + return reason + print(f"request pause for {reason}") + self.run_task(self.crawl_ops.pause_crawl(crawl.id, crawl.org, pause=True)) return None async def get_redis_crawl_stats( self, redis: Redis, crawl_id: str - ) -> tuple[CrawlStats, dict[str, Any]]: + ) -> tuple[OpCrawlStats, dict[str, Any]]: """get page stats""" try: # crawler >0.9.0, done 
key is a value @@ -1480,7 +1514,7 @@ async def get_redis_crawl_stats( profile_update = await redis.get(f"{crawl_id}:profileUploaded") - stats = CrawlStats( + stats = OpCrawlStats( found=pages_found, done=pages_done, size=archive_size, @@ -1500,21 +1534,36 @@ async def update_crawl_state( results = await redis.hgetall(f"{crawl.id}:status") stats, sizes = await self.get_redis_crawl_stats(redis, crawl.id) - # need to add size of previously completed WACZ files as well! + pending_size = stats.size + stats.size += status.filesAddedSize + total_size = stats.size + + print(f"pending size: {pending_size}", flush=True) + print(f"status.filesAdded: {status.filesAdded}", flush=True) + print(f"status.filesAddedSize: {status.filesAddedSize}", flush=True) + print(f"total: {total_size}", flush=True) + print( + f"org quota: {crawl.org.bytesStored + stats.size} <= {crawl.org.quotas.storageQuota}", + flush=True, + ) + # update status status.pagesDone = stats.done status.pagesFound = stats.found - status.size = stats.size + + status.sizePending = pending_size + status.size = total_size status.sizeHuman = humanize.naturalsize(status.size) await self.crawl_ops.update_running_crawl_stats( - crawl.db_crawl_id, crawl.is_qa, stats + crawl.db_crawl_id, crawl.is_qa, stats, pending_size ) for key, value in sizes.items(): increase_storage = False + pod_info = None value = int(value) if value > 0 and status.podStatus: pod_info = status.podStatus[key] @@ -1530,11 +1579,11 @@ async def update_crawl_state( increase_storage = True # out of storage - if pod_info.isNewExit and pod_info.exitCode == 3: + if pod_info and pod_info.isNewExit and pod_info.exitCode == 3: pod_info.used.storage = pod_info.allocated.storage increase_storage = True - if increase_storage: + if pod_info and increase_storage: new_storage = math.ceil( pod_info.used.storage * self.min_avail_storage_ratio / 1_000_000_000 ) @@ -1544,7 +1593,7 @@ async def update_crawl_state( ) # check if no longer paused, clear paused stopping state - 
if status.stopReason == "paused" and not crawl.paused_at: + if status.stopReason in PAUSED_STATES and not crawl.paused_at: status.stopReason = None status.stopping = False # should have already been removed, just in case @@ -1556,9 +1605,9 @@ # mark crawl as stopping if status.stopping: - if status.stopReason == "paused": + if status.stopReason in PAUSED_STATES: await redis.set(f"{crawl.id}:paused", "1") - print(f"Crawl pausing, id: {crawl.id}") + print(f"Crawl pausing: {status.stopReason}, id: {crawl.id}") else: await redis.set(f"{crawl.id}:stopping", "1") print( @@ -1582,15 +1631,37 @@ all_completed = (num_done + num_failed) >= status.scale # check paused - if not all_completed and crawl.paused_at and status.stopReason == "paused": + if not all_completed and crawl.paused_at and status.stopReason in PAUSED_STATES: num_paused = status_count.get("interrupted", 0) if (num_paused + num_failed) >= status.scale: # now fully paused! - # remove pausing key and set state to paused - await redis.delete(f"{crawl.id}:paused") + # remove pausing key and set state to appropriate paused state + paused_state: TYPE_PAUSED_STATES + if status.stopReason == "paused_storage_quota_reached": + paused_state = "paused_storage_quota_reached" + elif status.stopReason == "paused_time_quota_reached": + paused_state = "paused_time_quota_reached" + elif status.stopReason == "paused_org_readonly": + paused_state = "paused_org_readonly" + else: + paused_state = "paused" + + # await redis.delete(f"{crawl.id}:paused") await self.set_state( - "paused", status, crawl, allowed_from=RUNNING_AND_WAITING_STATES + paused_state, + status, + crawl, + allowed_from=RUNNING_AND_WAITING_STATES, + ) + + if paused_state != "paused": + await self.crawl_ops.notify_org_admins_of_auto_paused_crawl( + paused_reason=paused_state, + crawl_id=crawl.id, + cid=crawl.cid, + org=crawl.org, + ) + return status # if at least one is done according to redis, consider crawl 
successful @@ -1648,7 +1719,7 @@ async def mark_finished( crawl: CrawlSpec, status: CrawlStatus, state: TYPE_NON_RUNNING_STATES, - stats: Optional[CrawlStats] = None, + stats: Optional[OpCrawlStats] = None, ) -> bool: """mark crawl as finished, set finished timestamp and final state""" @@ -1692,7 +1763,7 @@ async def do_crawl_finished_tasks( crawl: CrawlSpec, status: CrawlStatus, state: TYPE_NON_RUNNING_STATES, - stats: Optional[CrawlStats], + stats: Optional[OpCrawlStats], ) -> None: """Run tasks after crawl completes in asyncio.task coroutine.""" await self.crawl_config_ops.stats_recompute_last( @@ -1701,9 +1772,6 @@ async def do_crawl_finished_tasks( if state in SUCCESSFUL_STATES and crawl.oid: await self.page_ops.set_archived_item_page_counts(crawl.id) - await self.org_ops.inc_org_bytes_stored( - crawl.oid, status.filesAddedSize, "crawl" - ) await self.org_ops.set_last_crawl_finished(crawl.oid) await self.coll_ops.add_successful_crawl_to_collections( crawl.id, crawl.cid, crawl.oid diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 93677d4bab..446babe196 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -6,7 +6,12 @@ from typing import Optional, DefaultDict, Literal, Annotated, Any from pydantic import BaseModel, Field from kubernetes.utils import parse_quantity -from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization +from btrixcloud.models import ( + StorageRef, + TYPE_ALL_CRAWL_STATES, + Organization, + CrawlStats, +) BTRIX_API = "btrix.cloud/v1" @@ -25,6 +30,9 @@ "stopped_storage_quota_reached", "stopped_time_quota_reached", "stopped_org_readonly", + "paused_storage_quota_reached", + "paused_time_quota_reached", + "paused_org_readonly", ] @@ -200,6 +208,13 @@ def should_restart_pod(self, forced: bool = False) -> Optional[str]: return None +# ============================================================================ +class 
OpCrawlStats(CrawlStats): + """crawl stats + internal profile update""" + + profile_update: Optional[str] = "" + + # ============================================================================ # pylint: disable=invalid-name class CrawlStatus(BaseModel): @@ -212,6 +227,9 @@ class CrawlStatus(BaseModel): # human readable size string sizeHuman: str = "" + # pending size (not uploaded) + sizePending: int = 0 + # actual observed scale (number of pods active) scale: int = 0 # desired scale as computed by crawl state (number of pods that should be active) diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py index d984d4642c..d27f73cc86 100644 --- a/backend/test_nightly/conftest.py +++ b/backend/test_nightly/conftest.py @@ -1,7 +1,7 @@ import pytest import requests import time -import datetime +from datetime import datetime, timezone HOST_PREFIX = "http://127.0.0.1:30870" @@ -313,7 +313,7 @@ def error_crawl_id(admin_auth_headers, default_org_id): @pytest.fixture(scope="session") def org_with_quotas(admin_auth_headers): - name = "Quota Org " + datetime.datetime.utcnow().isoformat() + name = "Quota Org " + datetime.now(timezone.utc).isoformat() r = requests.post( f"{API_PREFIX}/orgs/create", headers=admin_auth_headers, json={"name": name} ) diff --git a/backend/test_nightly/test_execution_minutes_quota.py b/backend/test_nightly/test_execution_minutes_quota.py index cd81d86330..d84ca09d04 100644 --- a/backend/test_nightly/test_execution_minutes_quota.py +++ b/backend/test_nightly/test_execution_minutes_quota.py @@ -16,8 +16,6 @@ EXTRA_MINS_QUOTA = 5 EXTRA_SECS_QUOTA = EXTRA_MINS_QUOTA * 60 -config_id = None - def test_set_execution_mins_quota(org_with_quotas, admin_auth_headers): r = requests.post( @@ -29,9 +27,8 @@ def test_set_execution_mins_quota(org_with_quotas, admin_auth_headers): assert data.get("updated") == True -def test_crawl_stopped_when_quota_reached(org_with_quotas, admin_auth_headers): +def 
test_crawl_paused_when_quota_reached(org_with_quotas, admin_auth_headers): # Run crawl - global config_id crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) time.sleep(1) @@ -49,10 +46,10 @@ def test_crawl_stopped_when_quota_reached(org_with_quotas, admin_auth_headers): ): time.sleep(2) - # Ensure that crawl was stopped by quota + # Ensure that crawl was paused by quota assert ( get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) - == "stopped_time_quota_reached" + == "paused_time_quota_reached" ) time.sleep(5) @@ -108,16 +105,12 @@ def test_set_execution_mins_extra_quotas(org_with_quotas, admin_auth_headers): @pytest.mark.timeout(1200) -def test_crawl_stopped_when_quota_reached_with_extra( +def test_crawl_paused_when_quota_reached_with_extra( org_with_quotas, admin_auth_headers ): # Run crawl - r = requests.post( - f"{API_PREFIX}/orgs/{org_with_quotas}/crawlconfigs/{config_id}/run", - headers=admin_auth_headers, - ) - assert r.status_code == 200 - crawl_id = r.json()["started"] + crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) + time.sleep(1) while get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) in ( "starting", @@ -133,10 +126,10 @@ def test_crawl_stopped_when_quota_reached_with_extra( ): time.sleep(2) - # Ensure that crawl was stopped by quota + # Ensure that crawl was paused by quota assert ( get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) - == "stopped_time_quota_reached" + == "paused_time_quota_reached" ) time.sleep(5) diff --git a/backend/test_nightly/test_storage_quota.py b/backend/test_nightly/test_storage_quota.py index 7945092415..dc8fa9f084 100644 --- a/backend/test_nightly/test_storage_quota.py +++ b/backend/test_nightly/test_storage_quota.py @@ -57,7 +57,7 @@ def test_storage_quota(org_with_quotas, admin_auth_headers): assert r.json()["updated"] -def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_headers): +def 
test_crawl_paused_when_storage_quota_reached(org_with_quotas, admin_auth_headers): # Run crawl global config_id crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers) @@ -82,7 +82,7 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he assert ( get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) - == "stopped_storage_quota_reached" + == "paused_storage_quota_reached" ) # Ensure crawl storage went over quota diff --git a/emails/emails/crawl-auto-paused.tsx b/emails/emails/crawl-auto-paused.tsx new file mode 100644 index 0000000000..58e28cf7ac --- /dev/null +++ b/emails/emails/crawl-auto-paused.tsx @@ -0,0 +1,178 @@ +import { Link, Text } from "@react-email/components"; + +import { Template } from "../templates/btrix.js"; +import { + differenceInDays, + formatDate, + formatRelativeDate, + formatRelativeDateToParts, + offsetDays, +} from "../lib/date.js"; + +import { z } from "zod"; +import { trimTrailingSlash } from "../lib/url.js"; + + +export const schema = z.object({ + user_name: z.string(), + org_name: z.string(), + paused_reason: z.enum(["paused_storage_quota_reached", "paused_time_quota_reached", "paused_org_readonly"]), + paused_expiry: z.coerce.date(), + workflow_url: z.url().transform(trimTrailingSlash), + org_url: z.url().transform(trimTrailingSlash), + support_email: z.email().optional(), +}); + +export type CrawlAutoPausedEmailProps = z.infer; + +export const CrawlAutoPausedEmail = ({ + user_name, + org_name, + paused_reason, + paused_expiry, + workflow_url, + org_url, + support_email +}: CrawlAutoPausedEmailProps) => { + const daysLeft = differenceInDays(new Date(paused_expiry)); + const relativeParts = formatRelativeDateToParts(daysLeft, "days"); + return ( + + ); +}; + +CrawlAutoPausedEmail.PreviewProps = { + user_name: "Tessa", + org_name: "Tessa’s Archives", + paused_reason: "paused_storage_quota_reached", + paused_expiry: offsetDays(7), + workflow_url: 
"https://dev.browsertrix.com/orgs/default-org/workflows/d4a6cb18-eb54-4d25-a9e8-bb10a3eefa31/latest", + org_url: "https://dev.browsertrix.com/orgs/default-org", + support_email: "support@webrecorder.net", +} satisfies CrawlAutoPausedEmailProps; + +export default CrawlAutoPausedEmail; + +export const subject = () => "Your Browsertrix crawl was automatically paused"; diff --git a/emails/emails/index.ts b/emails/emails/index.ts index 10c71c0f87..cd619ac15a 100644 --- a/emails/emails/index.ts +++ b/emails/emails/index.ts @@ -4,3 +4,4 @@ export * as subscriptionCancel from "./subscription-cancel.js"; export * as trialEndingSoon from "./trial-ending-soon.js"; export * as verifyEmail from "./verify-email.js"; export * as failedBgJob from "./failed-bg-job.js"; +export * as crawlAutoPaused from "./crawl-auto-paused.js"; diff --git a/frontend/README.md b/frontend/README.md index 33f2244e45..29d3742a77 100644 --- a/frontend/README.md +++ b/frontend/README.md @@ -5,12 +5,14 @@ This directory contains the following components: ## Docs (`/docs`) Documentation for running, developing, and writing docs: + - Hosted: [docs.browsertrix.com/develop/docs](https://docs.browsertrix.com/develop/docs) - Local: [develop/docs.md](./docs/docs/develop/docs.md) ## User Interface (`/src`) Documentation for developing the user interface: + - Hosted: [docs.browsertrix.com/develop/frontend-dev](https://docs.browsertrix.com/develop/frontend-dev) - Local: [develop/frontend-dev.md](./docs/docs/develop/frontend-dev.md) diff --git a/frontend/src/features/archived-items/crawl-status.ts b/frontend/src/features/archived-items/crawl-status.ts index ed5a23cdaa..9c7b70eb2a 100644 --- a/frontend/src/features/archived-items/crawl-status.ts +++ b/frontend/src/features/archived-items/crawl-status.ts @@ -6,6 +6,7 @@ import startCase from "lodash/fp/startCase"; import { TailwindElement } from "@/classes/TailwindElement"; import { labelWithIcon } from "@/layouts/labelWithIcon"; import { RUNNING_STATES, type 
CrawlState } from "@/types/crawlState"; +import { isPaused } from "@/utils/crawler"; import { animatePulse } from "@/utils/css"; type CrawlType = "crawl" | "qa"; @@ -170,6 +171,36 @@ export class CrawlStatus extends TailwindElement { label = msg("Paused"); break; + case "paused_storage_quota_reached": + color = "var(--sl-color-neutral-500)"; + icon = html``; + label = msg("Paused: Storage Quota Reached"); + break; + + case "paused_time_quota_reached": + color = "var(--sl-color-neutral-500)"; + icon = html``; + label = msg("Paused: Time Quota Reached"); + break; + + case "paused_org_readonly": + color = "var(--sl-color-neutral-500)"; + icon = html``; + label = msg("Paused: Crawling Disabled"); + break; + case "pending-wait": color = "var(--sl-color-violet-600)"; icon = html` diff --git a/frontend/src/stories/design/status-indicators.mdx b/frontend/src/stories/design/status-indicators.mdx index 4bae8e1077..144fb14294 100644 --- a/frontend/src/stories/design/status-indicators.mdx +++ b/frontend/src/stories/design/status-indicators.mdx @@ -1,6 +1,7 @@ -import { Meta, Canvas } from '@storybook/addon-docs/blocks'; -import { AllStates } from '../features/archived-items/CrawlStatus.stories'; +import { Canvas, Meta } from "@storybook/addon-docs/blocks"; + import { ColorSwatch } from "../../../.storybook/blocks/ColorSwatch"; +import { AllStates } from "../features/archived-items/CrawlStatus.stories"; @@ -21,39 +22,38 @@ further clarity as to what they indicate. 
### Generalized Status -| Icon & Label | Color | Context | Description | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | --------------- | --------------------------------------------------------------- | -| 1 Crawl Running | Shoelace `--sl-color-success-600` | Dashboard | Count of crawls in "running" status when that count is non-zero | -| 0 Crawls Running | Shoelace Shoelace `--sl-color-neutral-600` | Dashboard | Count of crawls in "running" status when that count is zero | -| 1 Crawl Workflow Waiting | Shoelace `--sl-color-violet-600` | Dashboard | Count of crawls in "waiting" status | -| No Crawls Yet | Shoelace `--sl-color-neutral-400` | Crawl Workflows | Used to show that a workflow has no crawls | -| Complete | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was completed | -| Stopped | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped by the user | -| Stopped: [Reason] | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped due to system reason | -| Canceled | Shoelace `--sl-color-neutral-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was canceled | -| Starting | Shoelace `--sl-color-violet-600` | Crawl Workflows | Used to show that a crawl is starting | -| Running | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a crawl is running | -| Behavior timed out | Shoelace `--sl-color-warning-600` | Crawl Logs | Used to show a warning log from a behavior | -| Success | Shoelace `--sl-color-success-600` | Toasts | Used to show a success notification | -| Warning | Shoelace `--sl-color-warning-600` 
| Toasts | Used to show a warning notification | -| Danger | Shoelace `--sl-color-danger-600` | Toasts | Used to show an error notification | - +| Icon & Label | Color | Context | Description | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------- | +| 1 Crawl Running | Shoelace `--sl-color-success-600` | Dashboard | Count of crawls in "running" status when that count is non-zero | +| 0 Crawls Running | Shoelace `--sl-color-neutral-600`          | Dashboard | Count of crawls in "running" status when that count is zero | +| 1 Crawl Workflow Waiting | Shoelace `--sl-color-violet-600` | Dashboard | Count of crawls in "waiting" status | +| No Crawls Yet | Shoelace `--sl-color-neutral-400` | Crawl Workflows | Used to show that a workflow has no crawls | +| Complete | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was completed | +| Stopped | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped by the user | +| Stopped: [Reason] | Shoelace `--sl-color-warning-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was stopped due to system reason | +| Canceled | Shoelace `--sl-color-neutral-600` | Crawl Workflows | Used to show that a workflow's most recent crawl was canceled | +| Starting | Shoelace `--sl-color-violet-600` | Crawl Workflows | Used to show that a crawl is starting | +| Running | Shoelace `--sl-color-success-600` | Crawl Workflows | Used to show that a crawl is running | +| Behavior timed out | Shoelace `--sl-color-warning-600` | Crawl Logs | Used to show a warning log from a behavior | +|
Success | Shoelace `--sl-color-success-600` | Toasts | Used to show a success notification | +| Warning | Shoelace `--sl-color-warning-600` | Toasts | Used to show a warning notification | +| Danger | Shoelace `--sl-color-danger-600` | Toasts | Used to show an error notification | ### Exact Status - + ## Intended Implementation -| Status | Color | Description | Icons | Examples | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Empty | Shoelace `--sl-color-neutral-400` | Used for empty states where no data is present | `slash-circle` | No Crawls Yet | -| Pending | Shoelace `--sl-color-violet-600` | Used when a process is queued or starting but is not yet running. Should be animated when indicating the status of a single object. 
| `hourglass-split`, or the icon of the next state being transitioned to (pulsing) | 1 Crawl Workflow Waiting
Starting
Resuming | -| Running | Shoelace `--sl-color-success-600` | Used when a process is actively running. Should be animated when indicating the status of a single object. | `dot` | Running | -| Paused | Shoelace `--sl-color-neutral-600` | Used for paused states | `pause-circle` or `play-circle` | Pause
Resume | -| Success | Shoelace `--sl-color-success-600` | Used for positive / successful states | `check-circle-fill` or `check2-circle` | Complete | -| Info | Shoelace `--sl-color-neutral-500` | Used for neutral, informational states | `info-circle-fill` | Behavior Log | -| Incomplete | Shoelace `--sl-color-warning-600` | Used for states that are ambiguous or partially satisfied, but no longer running | `dash-square-fill` | Stopped | -| Warning | Shoelace `--sl-color-warning-600` | Used for warning states, something is wrong but not critically | `exclamation-diamond-fill` or `exclamation-diamond` | Warning | -| Danger | Shoelace `--sl-color-danger-600` | Used for non-fatal errors that may be addressed by the user | `exclamation-triangle-fill` or `exclamation-triangle` | Payment Failed | -| Fatal | Shoelace `--sl-color-danger-600` | Used for fatal errors and actions that result in data loss | `x-octagon-fill` or `x-octagon` | Cancel | +| Status | Color | Description | Icons | Examples | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Empty | Shoelace `--sl-color-neutral-400` | Used for empty states where no data is present | `slash-circle` | No Crawls Yet | +| Pending | Shoelace `--sl-color-violet-600` | Used when a process is queued or starting but is not yet running. Should be animated when indicating the status of a single object. | `hourglass-split`, or the icon of the next state being transitioned to (pulsing) | 1 Crawl Workflow Waiting
Starting
Resuming | +| Running | Shoelace `--sl-color-success-600` | Used when a process is actively running. Should be animated when indicating the status of a single object. | `dot` | Running | +| Paused | Shoelace `--sl-color-neutral-600` | Used for paused states | `pause-circle` or `play-circle` | Pause
Resume | +| Success | Shoelace `--sl-color-success-600` | Used for positive / successful states | `check-circle-fill` or `check2-circle` | Complete | +| Info | Shoelace `--sl-color-neutral-500` | Used for neutral, informational states | `info-circle-fill` | Behavior Log | +| Incomplete | Shoelace `--sl-color-warning-600` | Used for states that are ambiguous or partially satisfied, but no longer running | `dash-square-fill` | Stopped | +| Warning | Shoelace `--sl-color-warning-600` | Used for warning states, something is wrong but not critically | `exclamation-diamond-fill` or `exclamation-diamond` | Warning | +| Danger | Shoelace `--sl-color-danger-600` | Used for non-fatal errors that may be addressed by the user | `exclamation-triangle-fill` or `exclamation-triangle` | Payment Failed | +| Fatal | Shoelace `--sl-color-danger-600` | Used for fatal errors and actions that result in data loss | `x-octagon-fill` or `x-octagon` | Cancel | diff --git a/frontend/src/types/crawlState.ts b/frontend/src/types/crawlState.ts index 6abc45b67b..f32e06c1ae 100644 --- a/frontend/src/types/crawlState.ts +++ b/frontend/src/types/crawlState.ts @@ -6,12 +6,25 @@ export const RUNNING_STATES = [ "uploading-wacz", ] as const; -// Match backend TYPE_WAITING_STATES in models.py -export const WAITING_STATES = [ +// Match backend TYPE_WAITING_NOT_PAUSED_STATES in models.py +export const WAITING_NOT_PAUSED_STATES = [ "starting", "waiting_capacity", "waiting_org_limit", +] as const; + +// Match backend TYPE_PAUSED_STATES in models.py +export const PAUSED_STATES = [ "paused", + "paused_storage_quota_reached", + "paused_time_quota_reached", + "paused_org_readonly", +] as const; + +// Match backend TYPE_WAITING_STATES in models.py +export const WAITING_STATES = [ + ...WAITING_NOT_PAUSED_STATES, + ...PAUSED_STATES, ] as const; // Match backend TYPE_SUCCESSFUL_STATES in models.py diff --git a/frontend/src/utils/crawler.ts b/frontend/src/utils/crawler.ts index a0cfa35eca..76be91de31 100644 --- 
a/frontend/src/utils/crawler.ts +++ b/frontend/src/utils/crawler.ts @@ -5,6 +5,7 @@ import { html, type TemplateResult } from "lit"; import type { ArchivedItem, Crawl, Upload, Workflow } from "@/types/crawler"; import { FAILED_STATES, + PAUSED_STATES, RUNNING_AND_WAITING_STATES, SUCCESSFUL_AND_FAILED_STATES, SUCCESSFUL_STATES, @@ -51,6 +52,10 @@ export function isNotFailed({ state }: { state: string | null }) { ); } +export function isPaused(state: string | null) { + return state && (PAUSED_STATES as readonly string[]).includes(state); +} + export function isPageScopeType( scope?: (typeof WorkflowScopeType)[keyof typeof WorkflowScopeType], ) {