7 changes: 7 additions & 0 deletions .github/workflows/k3d-ci.yaml
@@ -45,6 +45,13 @@ jobs:
needs: paths-filter
if: needs.paths-filter.outputs.matches == 'true'
steps:
- name: Initial Disk Cleanup
uses: mathio/gha-cleanup@v1
with:
remove-browsers: true
verbose: true


- name: Create k3d Cluster
uses: AbsaOSS/k3d-action@v2
with:
4 changes: 3 additions & 1 deletion backend/btrixcloud/basecrawls.py
@@ -464,9 +464,11 @@ async def _resolve_crawl_refs(
raise HTTPException(status_code=400, detail="missing_org")

if hasattr(crawl, "profileid") and crawl.profileid:
crawl.profileName = await self.crawl_configs.profiles.get_profile_name(
profile = await self.crawl_configs.profiles.get_profile(
crawl.profileid, org
)
if profile:
crawl.profileName = profile.name

if (
files
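The change here swaps a name-only accessor for a full profile fetch with a None guard. A minimal sketch of the resulting pattern (the profiles object and its Optional return type are assumptions based on this diff, not the real ProfileOps API):

from typing import Optional
from uuid import UUID

async def resolve_profile_name(profiles, profileid: Optional[UUID], org) -> Optional[str]:
    # Fetch the whole profile once; callers can then read .name, .proxyId,
    # etc. from the same object instead of one accessor call per field.
    if not profileid:
        return None
    profile = await profiles.get_profile(profileid, org)  # assumed to return Optional[Profile]
    return profile.name if profile else None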
55 changes: 32 additions & 23 deletions backend/btrixcloud/crawlconfigs.py
@@ -265,23 +265,18 @@ async def add_crawl_config(
proxy_id = config_in.proxyId

profileid = None
# ensure profile is valid, get proxy_id from profile
if isinstance(config_in.profileid, UUID):
profileid = config_in.profileid

# ensure profile is valid, get proxy_id from profile
if profileid:
profile = await self.profiles.get_profile(profileid, org)
proxy_id = profile.proxyId
proxy_id = None
else:
if config_in.config and config_in.config.failOnContentCheck:
raise HTTPException(
status_code=400, detail="fail_on_content_check_requires_profile"
)

# ensure proxy_id is valid and available for org
if proxy_id:
if not self.can_org_use_proxy(org, proxy_id):
raise HTTPException(status_code=404, detail="proxy_not_found")
self.assert_can_org_use_proxy(org, proxy_id)

if config_in.config.exclude:
exclude = config_in.config.exclude
@@ -599,10 +594,20 @@ async def update_crawl_config(
changed = changed or (
update.profileid is not None
and update.profileid != orig_crawl_config.profileid
and ((not update.profileid) != (not orig_crawl_config.profileid))
and not (update.profileid == "" and not orig_crawl_config.profileid)
)

# either unsetting profile or no profile set on current config
no_profile = update.profileid == "" or (
update.profileid is None and not orig_crawl_config.profileid
)

changed = changed or (orig_crawl_config.proxyId != update.proxyId)
changed = changed or (
no_profile
and update.proxyId is not None
and orig_crawl_config.proxyId != update.proxyId
and not (update.proxyId == "" and not orig_crawl_config.proxyId)
)

metadata_changed = self.check_attr_changed(orig_crawl_config, update, "name")
metadata_changed = metadata_changed or self.check_attr_changed(
Expand Down Expand Up @@ -633,8 +638,6 @@ async def update_crawl_config(
last_rev = ConfigRevision(**orig_dict)
last_rev = await self.config_revs.insert_one(last_rev.to_dict())

proxy_id = update.proxyId

# set update query
query = update.dict(exclude_unset=True)
query["modifiedBy"] = user.id
@@ -647,14 +650,15 @@
# else, ensure its a valid profile
elif update.profileid:
profile = await self.profiles.get_profile(cast(UUID, update.profileid), org)
self.assert_can_org_use_proxy(org, profile.proxyId)
query["profileid"] = update.profileid
proxy_id = profile.proxyId
# don't change the proxy if profile is set, as it should match the profile proxy
elif orig_crawl_config.profileid:
proxy_id = None

if proxy_id is not None:
query["proxyId"] = proxy_id
if no_profile:
if update.proxyId == "":
query["proxyId"] = None
elif update.proxyId:
self.assert_can_org_use_proxy(org, update.proxyId)
query["proxyId"] = update.proxyId

if update.config is not None:
query["config"] = update.config.dict()
@@ -1025,9 +1029,10 @@ async def get_crawl_config_out(self, cid: UUID, org: Organization):
await self._add_running_curr_crawl_stats(crawlconfig)

if crawlconfig.profileid:
crawlconfig.profileName = await self.profiles.get_profile_name(
crawlconfig.profileid, org
)
profile = await self.profiles.get_profile(crawlconfig.profileid, org)
if profile:
crawlconfig.profileName = profile.name
crawlconfig.proxyId = profile.proxyId

crawlconfig.config.seeds = None

@@ -1241,8 +1246,7 @@ async def run_now_internal(
else:
profile_filename = ""

if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
raise HTTPException(status_code=404, detail="proxy_not_found")
self.assert_can_org_use_proxy(org, crawlconfig.proxyId)

storage_filename = (
crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1418,6 +1422,11 @@ def can_org_use_proxy(self, org: Organization, proxy: CrawlerProxy | str) -> bool:
_proxy.shared and org.allowSharedProxies
) or _proxy.id in org.allowedProxies

def assert_can_org_use_proxy(self, org: Organization, proxy: Optional[str]):
"""assert that proxy can be used or throw error"""
if proxy and not self.can_org_use_proxy(org, proxy):
raise HTTPException(status_code=400, detail="proxy_not_found")

def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
"""Generate WARC prefix slug from org slug, name or url
if no name is provided, hostname is used from url, otherwise
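The trickiest part of update_crawl_config is deciding when a proxyId update counts as a settings change: the payload is tri-state (a field omitted means untouched, "" means unset, any other value means set), and an explicit proxyId is only honored when no profile will be in effect afterward, since a profile always supplies its own proxy. A condensed, self-contained sketch of that decision as a standalone function, with plain strings standing in for the real CrawlConfig/CrawlConfigUpdate models (names here are illustrative, not the actual API):

from typing import Optional

def proxy_settings_changed(
    orig_profileid: Optional[str],
    orig_proxy_id: Optional[str],
    new_profileid: Optional[str],
    new_proxy_id: Optional[str],
) -> bool:
    # profile changed: a real value that differs, or "" when a profile was set
    changed = (
        new_profileid is not None
        and new_profileid != orig_profileid
        and not (new_profileid == "" and not orig_profileid)
    )

    # proxyId only matters when no profile will be in effect:
    # either the update unsets it, or none was set to begin with
    no_profile = new_profileid == "" or (
        new_profileid is None and not orig_profileid
    )
    changed = changed or (
        no_profile
        and new_proxy_id is not None
        and orig_proxy_id != new_proxy_id
        and not (new_proxy_id == "" and not orig_proxy_id)
    )
    return changed

# Worked cases, mirroring test_update_profile below:
assert proxy_settings_changed(None, None, "p2", None)        # add profile
assert not proxy_settings_changed("p2", None, "p2", None)    # same profile
assert proxy_settings_changed("p2", None, "", None)          # remove profile
assert not proxy_settings_changed(None, None, "", None)      # already unset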
2 changes: 1 addition & 1 deletion backend/btrixcloud/db.py
@@ -35,7 +35,7 @@
) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object


CURR_DB_VERSION = "0052"
CURR_DB_VERSION = "0054"


# ============================================================================
@@ -0,0 +1,36 @@
"""
Migration 0054 -- clear proxyId on workflows that have profile set
using proxyId from profile always
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0054"


class Migration(BaseMigration):
"""Migration class."""

# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)

async def migrate_up(self):
"""Perform migration up.

Unset proxyId on workflows that have a profileid set
"""
crawl_configs = self.mdb["crawl_configs"]

try:
await crawl_configs.update_many(
{"profileid": {"$ne": None}, "proxyId": {"$ne": None}},
{"$set": {"proxyId": None}},
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error update crawl_configs: {err}",
flush=True,
)
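The migration body reduces to a single update_many. For illustration, a standalone pymongo equivalent (the connection URL and database name are assumptions, not values from this PR):

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # assumed local instance
crawl_configs = client["btrix"]["crawl_configs"]   # database name is an assumption

# Same filter/update as migrate_up: any workflow that has both a profile and
# an explicit proxy loses the proxy; it is derived from the profile from now on.
result = crawl_configs.update_many(
    {"profileid": {"$ne": None}, "proxyId": {"$ne": None}},
    {"$set": {"proxyId": None}},
)
print(f"matched={result.matched_count} modified={result.modified_count}")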
2 changes: 2 additions & 0 deletions backend/btrixcloud/models.py
@@ -2428,6 +2428,8 @@ class ProfileFile(BaseFile):
class Profile(BaseMongoModel):
"""Browser profile"""

id: UUID

name: str
description: Optional[str] = ""

4 changes: 4 additions & 0 deletions backend/btrixcloud/orgs.py
@@ -1300,6 +1300,10 @@ async def import_org(
if not workflow.get("crawlerChannel"):
workflow["crawlerChannel"] = "default"

# Ensure proxyId is unset if profile is set
if workflow.get("profileid"):
workflow["proxyId"] = None

crawl_config = CrawlConfig.from_dict(workflow)
await self.crawl_configs_db.insert_one(crawl_config.to_dict())

53 changes: 53 additions & 0 deletions backend/test/test_crawlconfigs.py
@@ -1074,6 +1074,59 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
assert page["url"]


def test_update_profile(crawler_auth_headers, default_org_id, profile_id, profile_2_id):

def get_profile():
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
return r.json()["profileid"]

def update_profile(profileid):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"profileid": profileid},
)
assert r.status_code == 200
return r.json()

# No profile to start
assert get_profile() == None

# Add profile
data = update_profile(profile_2_id)
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
assert get_profile() == profile_2_id

# Same profile
data = update_profile(profile_2_id)
assert data["settings_changed"] == False
assert data["metadata_changed"] == False
assert get_profile() == profile_2_id

# Add different profile
data = update_profile(profile_id)
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
assert get_profile() == profile_id

# Remove profile
data = update_profile("")
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
assert get_profile() == None

# No change
data = update_profile("")
assert data["settings_changed"] == False
assert data["metadata_changed"] == False
assert get_profile() == None


def test_add_crawl_config_fail_on_content_check_no_profile(
crawler_auth_headers, default_org_id, sample_crawl_data
):
6 changes: 6 additions & 0 deletions chart/test/test.yaml
@@ -16,6 +16,12 @@ operator_resync_seconds: 3

qa_scale: 2

# lower storage sizes
redis_storage: "100Mi"
profile_browser_workdir_size: "100Mi"
crawler_storage: "500Mi"


# for testing only
crawler_extra_cpu_per_browser: 300m
