Skip to content

Commit c14df11

Browse files
authored
fix cleanup job to delete orphaned seed files after org has been deleted (#2920)
- in seed cleanup, catch if org doesn't exist and attempt to delete file directly - add delete_file_from_default_storage() to allow deleting left-over seed files, assuming they are stored in default storage - side effect of #2918, which will be fixed in #2919 - no need for migration, as orphaned files will be picked up by the existing cleanup seed files job
1 parent 9a734a1 commit c14df11

File tree

3 files changed

+40
-6
lines changed

3 files changed

+40
-6
lines changed

backend/btrixcloud/file_uploads.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,9 +198,12 @@ async def upload_user_file_stream(
198198

199199
file_id = uuid4()
200200

201-
new_filename = f"{upload_type}-{str(file_id)}{extension}"
201+
new_filename = f"{str(file_id)}{extension}"
202202

203-
prefix = org.storage.get_storage_extra_path(str(org.id)) + f"{upload_type}s/"
203+
prefix = (
204+
org.storage.get_storage_extra_path(str(org.id))
205+
+ f"{upload_type}s/{upload_type}"
206+
)
204207

205208
file_prep = UserFilePreparer(
206209
prefix,
@@ -383,6 +386,22 @@ async def cleanup_unused_seed_files(self):
383386
org = await self.org_ops.get_org_by_id(file_dict["oid"])
384387
await self.delete_seed_file(file_id, org)
385388
print(f"Deleted unused seed file {file_id}", flush=True)
389+
390+
except HTTPException as e:
391+
# handle case where org is deleted but seed file still exists
392+
if e.detail == "invalid_org_id":
393+
try:
394+
await self.storage_ops.delete_file_from_default_storage(
395+
file_dict["filename"]
396+
)
397+
await self.files.delete_one({"_id": file_id})
398+
# pylint: disable=broad-exception-caught
399+
except Exception as err:
400+
print(
401+
f"Error deleting orphaned seed file without org {file_id}: {err}",
402+
flush=True,
403+
)
404+
386405
# pylint: disable=broad-exception-caught
387406
except Exception as err:
388407
print(f"Error deleting unused seed file {file_id}: {err}", flush=True)

backend/btrixcloud/storages.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373

7474
# ============================================================================
7575
# pylint: disable=broad-except,raise-missing-from,too-many-instance-attributes
76-
# pylint: disable=too-many-public-methods
76+
# pylint: disable=too-many-public-methods, too-many-lines
7777
class StorageOps:
7878
"""All storage handling, download/upload operations"""
7979

@@ -638,17 +638,32 @@ async def _delete_file(
638638
self, org: Organization, filename: str, storage: StorageRef
639639
) -> bool:
640640
"""delete specified file from storage"""
641-
status_code = None
642-
643641
s3storage = self.get_org_storage_by_ref(org, storage)
644642

643+
return await self._delete_file_from_storage(s3storage, filename)
644+
645+
async def _delete_file_from_storage(self, s3storage: S3Storage, filename: str):
646+
"""delete file from specified storage"""
647+
status_code = None
648+
645649
async with self.get_s3_client(s3storage) as (client, bucket, key):
646650
key += filename
647651
response = await client.delete_object(Bucket=bucket, Key=key)
648652
status_code = response["ResponseMetadata"]["HTTPStatusCode"]
649653

650654
return status_code == 204
651655

656+
async def delete_file_from_default_storage(self, filename: str):
657+
"""delete file from default primary storage, if it exists"""
658+
if not self.default_primary:
659+
return False
660+
661+
s3storage = self.default_storages.get(self.default_primary.name)
662+
if not s3storage:
663+
return False
664+
665+
return await self._delete_file_from_storage(s3storage, filename)
666+
652667
async def sync_stream_wacz_pages(
653668
self, wacz_files: List[CrawlFileOut], num_retries=5
654669
) -> Iterator[Dict[Any, Any]]:

scripts/build-backend.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22
CURR=$(dirname "${BASH_SOURCE[0]}")
33

4-
docker build -t ${REGISTRY}webrecorder/browsertrix-backend:latest $CURR/../backend/
4+
docker build --load -t ${REGISTRY}webrecorder/browsertrix-backend:latest $CURR/../backend/
55

66
if [ -n "$REGISTRY" ]; then
77
docker push ${REGISTRY}webrecorder/browsertrix-backend

0 commit comments

Comments
 (0)