
Commit 2c7854c

misc backend profile fixes: (#2973)
- update inUse for profile list (fixes #2970)
- ensure origins for existing profiles returns all origins
- ensure proxyId and crawlerChannel when committing are set from the browser, not from API request

internal improvements:
- add crawlerChannel and proxyId to ProfileJob labels
- convert labels to typed ProfileBrowserMetadata
1 parent 26c100d commit 2c7854c

File tree

7 files changed: +152 additions, -49 deletions


backend/btrixcloud/crawlconfigs.py

Lines changed: 37 additions & 1 deletion
@@ -53,6 +53,7 @@
     ListFilterType,
     ScopeType,
     Seed,
+    Profile,
 )
 from .utils import (
     dt_now,
@@ -199,6 +200,14 @@ async def init_index(self):
             [("oid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
         )

+        await self.crawl_configs.create_index(
+            [
+                ("oid", pymongo.ASCENDING),
+                ("inactive", pymongo.ASCENDING),
+                ("profileid", pymongo.ASCENDING),
+            ]
+        )
+
         await self.crawl_configs.create_index(
             [("lastRun", pymongo.DESCENDING), ("modified", pymongo.DESCENDING)]
         )
@@ -834,10 +843,37 @@ async def get_crawl_configs(
     async def is_profile_in_use(self, profileid: UUID, org: Organization) -> bool:
         """return true/false if any active workflows exist with given profile"""
         res = await self.crawl_configs.find_one(
-            {"profileid": profileid, "inactive": {"$ne": True}, "oid": org.id}
+            {"oid": org.id, "inactive": {"$ne": True}, "profileid": profileid}
         )
         return res is not None

+    async def mark_profiles_in_use(self, profiles: List[Profile], org: Organization):
+        """mark which profiles are in use by querying and grouping crawlconfigs"""
+        profile_ids = [profile.id for profile in profiles]
+        cursor = self.crawl_configs.aggregate(
+            [
+                {
+                    "$match": {
+                        "oid": org.id,
+                        "inactive": {"$ne": True},
+                        "profileid": {"$in": profile_ids},
+                    }
+                },
+                {"$group": {"_id": "$profileid", "count": {"$sum": 1}}},
+            ]
+        )
+        results = await cursor.to_list()
+        in_use = set()
+        for res in results:
+            if res.get("count") > 0:
+                in_use.add(res.get("_id"))
+
+        for profile in profiles:
+            if profile.id in in_use:
+                profile.inUse = True
+
+        return profiles
+
     async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
         """Return the id of currently running crawl for this config, if any"""
         # crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
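The new mark_profiles_in_use helper replaces a per-profile lookup with a single $match/$group aggregation over active workflows. A minimal standalone sketch of the marking step, with a stubbed aggregation result and made-up UUIDs standing in for a real MongoDB cursor:

# Pure-Python sketch of the in-use marking logic, independent of MongoDB.
# The stubbed result below mirrors the shape produced by the $match/$group
# pipeline: one document per profileid with a count of active workflows.
from dataclasses import dataclass
from uuid import UUID, uuid4


@dataclass
class FakeProfile:
    id: UUID
    inUse: bool = False


profiles = [FakeProfile(id=uuid4()) for _ in range(3)]

# Pretend the first profile is referenced by two active workflows.
agg_results = [{"_id": profiles[0].id, "count": 2}]

in_use = {res["_id"] for res in agg_results if res["count"] > 0}

for profile in profiles:
    if profile.id in in_use:
        profile.inUse = True

print([p.inUse for p in profiles])  # [True, False, False]

The compound (oid, inactive, profileid) index added in init_index presumably keeps both this aggregation and is_profile_in_use served from a single index.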

backend/btrixcloud/crawlmanager.py

Lines changed: 10 additions & 11 deletions
@@ -11,7 +11,7 @@
 from .utils import dt_now, date_to_str, scale_from_browser_windows
 from .k8sapi import K8sAPI

-from .models import StorageRef, CrawlConfig, BgJobType
+from .models import StorageRef, CrawlConfig, BgJobType, ProfileBrowserMetadata


 # ============================================================================
@@ -25,13 +25,14 @@
 class CrawlManager(K8sAPI):
     """abstract crawl manager"""

-    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-arguments, too-many-locals
     async def run_profile_browser(
         self,
         userid: str,
         oid: str,
         url: str,
         storage: StorageRef,
+        crawler_channel: str,
         crawler_image: str,
         image_pull_policy: str,
         baseprofile: str = "",
@@ -59,6 +60,7 @@ async def run_profile_browser(
             "url": url,
             "vnc_password": secrets.token_hex(16),
             "expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
+            "crawler_channel": crawler_channel,
             "crawler_image": crawler_image,
             "image_pull_policy": image_pull_policy,
             "proxy_id": proxy_id or DEFAULT_PROXY_ID,
@@ -420,20 +422,17 @@ async def add_org_storage(
             name=storage_secret, namespace=self.namespace, body=crawl_secret
         )

-    async def get_profile_browser_metadata(self, browserid: str) -> dict[str, str]:
-        """get browser profile labels"""
-        try:
-            browser = await self.get_profile_browser(browserid)
-
-        # pylint: disable=bare-except
-        except:
-            return {}
+    async def get_profile_browser_metadata(
+        self, browserid: str
+    ) -> ProfileBrowserMetadata:
+        """get browser profile metadata from labels"""
+        browser = await self.get_profile_browser(browserid)

         metadata = browser["metadata"]["labels"]

         metadata["committing"] = browser.get("spec", {}).get("committing")

-        return metadata
+        return ProfileBrowserMetadata(**metadata)

     async def keep_alive_profile_browser(self, browserid: str, committing="") -> None:
         """update profile browser to not expire"""

backend/btrixcloud/models.py

Lines changed: 22 additions & 6 deletions
@@ -2455,6 +2455,25 @@ class Profile(BaseMongoModel):
     inUse: bool = False


+# ============================================================================
+class ProfileBrowserMetadata(BaseModel):
+    """Profile metadata stored in ProfileJob labels"""
+
+    browser: str
+
+    oid: str = Field(alias="btrix.org")
+    userid: UUID = Field(alias="btrix.user")
+    baseprofile: Optional[UUID] = Field(alias="btrix.baseprofile", default=None)
+    storage: str = Field(alias="btrix.storage")
+
+    profileid: UUID
+
+    proxyid: str = ""
+    crawlerChannel: str
+
+    committing: Optional[str] = None
+
+
 # ============================================================================
 class UrlIn(BaseModel):
     """Request to set url"""
@@ -2485,17 +2504,14 @@ class ProfileCreate(BaseModel):
     browserid: str
     name: str
     description: Optional[str] = ""
-    crawlerChannel: str = "default"
-    proxyId: Optional[str] = None


 # ============================================================================
-class ProfileUpdate(BaseModel):
+class ProfileUpdate(ProfileCreate):
     """Update existing profile with new browser profile or metadata only"""

-    browserid: Optional[str] = ""
-    name: str
-    description: Optional[str] = ""
+    # browserid optional if only updating metadata
+    browserid: str = ""


 # ============================================================================
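Making ProfileUpdate inherit from ProfileCreate means it only has to override browserid with an empty-string default, so a metadata-only update (name or description) still validates while creation keeps browserid required. A small sketch (pydantic assumed; sample values are illustrative and unrelated fields are omitted):

# Minimal sketch of the ProfileCreate/ProfileUpdate relationship after this change.
from typing import Optional

from pydantic import BaseModel, ValidationError


class ProfileCreate(BaseModel):
    browserid: str
    name: str
    description: Optional[str] = ""


class ProfileUpdate(ProfileCreate):
    # browserid optional if only updating metadata
    browserid: str = ""


# Metadata-only update: no browserid needed.
update = ProfileUpdate(name="My Profile", description="renamed only")
print(update.browserid == "")  # True

# Creating still requires a browser id.
try:
    ProfileCreate(name="My Profile")
except ValidationError:
    print("browserid required")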

backend/btrixcloud/profiles.py

Lines changed: 63 additions & 28 deletions
@@ -33,6 +33,7 @@
     SuccessResponseStorageQuota,
     ProfilePingResponse,
     ProfileBrowserGetUrlResponse,
+    ProfileBrowserMetadata,
 )
 from .utils import dt_now, str_to_date

@@ -130,6 +131,7 @@ async def create_new_browser(
             str(org.id),
             url=str(profile_launch.url),
             storage=org.storage,
+            crawler_channel=profile_launch.crawlerChannel,
             crawler_image=crawler_image,
             image_pull_policy=image_pull_policy,
             baseprofile=prev_profile_id,
@@ -172,11 +174,20 @@ async def get_profile_browser_url(
             params["url"] = url
         return params

-    async def ping_profile_browser(self, browserid: str) -> dict[str, Any]:
+    async def ping_profile_browser(
+        self, metadata: ProfileBrowserMetadata, org: Organization
+    ) -> dict[str, Any]:
         """ping profile browser to keep it running"""
-        data = await self._send_browser_req(browserid, "/ping")
+        data = await self._send_browser_req(metadata.browser, "/ping")
+        origins = data.get("origins") or []
+
+        if metadata.baseprofile:
+            base = await self.get_profile(metadata.baseprofile, org)
+            for origin in base.origins:
+                if origin not in origins:
+                    origins.append(origin)

-        return {"success": True, "origins": data.get("origins") or []}
+        return {"success": True, "origins": origins}

     async def navigate_profile_browser(
         self, browserid: str, urlin: UrlIn
@@ -190,21 +201,19 @@ async def navigate_profile_browser(

     async def commit_to_profile(
         self,
-        metadata: dict,
+        metadata: ProfileBrowserMetadata,
         browser_commit: ProfileCreate,
         org: Organization,
         user: User,
         existing_profile: Optional[Profile] = None,
     ) -> dict[str, Any]:
         """commit to profile async, returning if committed, or waiting"""
-        profileid = metadata.get("profileid")
-        if not profileid:
+        if not metadata.profileid:
             raise HTTPException(status_code=400, detail="browser_not_valid")

         self.orgs.can_write_data(org, include_time=False)

-        committing = metadata.get("committing")
-        if not committing:
+        if not metadata.committing:
             self._run_task(
                 self.do_commit_to_profile(
                     metadata=metadata,
@@ -215,19 +224,19 @@ async def commit_to_profile(
                 )
             )

-        if committing == "done":
+        if metadata.committing == "done":
             await self.crawl_manager.delete_profile_browser(browser_commit.browserid)
             return {
                 "added": True,
-                "id": profileid,
+                "id": str(metadata.profileid),
                 "storageQuotaReached": self.orgs.storage_quota_reached(org),
             }

         raise HTTPException(status_code=200, detail="waiting_for_browser")

     async def do_commit_to_profile(
         self,
-        metadata: dict,
+        metadata: ProfileBrowserMetadata,
         browser_commit: ProfileCreate,
         org: Organization,
         user: User,
@@ -238,6 +247,8 @@ async def do_commit_to_profile(
         try:
             now = dt_now()

+            origins = []
+
             if existing_profile:
                 profileid = existing_profile.id
                 created = existing_profile.created
@@ -246,8 +257,11 @@
                 prev_file_size = (
                     existing_profile.resource.size if existing_profile.resource else 0
                 )
+
+                origins = existing_profile.origins
+
             else:
-                profileid = UUID(metadata["profileid"])
+                profileid = metadata.profileid
                 created = now
                 created_by = user.id
                 created_by_name = user.name if user.name else user.email
@@ -275,10 +289,15 @@
                 storage=org.storage,
             )

-            baseid = metadata.get("btrix.baseprofile")
-            if baseid:
-                print("baseid", baseid)
-                baseid = UUID(baseid)
+            baseid = metadata.baseprofile
+
+            if origins:
+                for origin in data["origins"]:
+                    if origin not in origins:
+                        origins.append(origin)
+
+            else:
+                origins = data["origins"]

             profile = Profile(
                 id=profileid,
@@ -290,13 +309,13 @@
                 modified=now,
                 modifiedBy=user.id,
                 modifiedByName=user.name if user.name else user.email,
-                origins=data["origins"],
+                origins=origins,
                 resource=profile_file,
-                userid=UUID(metadata.get("btrix.user")),
+                userid=metadata.userid,
                 oid=org.id,
                 baseid=baseid,
-                crawlerChannel=browser_commit.crawlerChannel,
-                proxyId=browser_commit.proxyId,
+                crawlerChannel=metadata.crawlerChannel,
+                proxyId=metadata.proxyid,
             )

             await self.profiles.find_one_and_update(
@@ -455,6 +474,9 @@ async def list_profiles(
         total = 0

         profiles = [Profile.from_dict(res) for res in items]
+
+        profiles = await self.crawlconfigs.mark_profiles_in_use(profiles, org)
+
         return profiles, total

     async def get_profile(self, profileid: UUID, org: Organization) -> Profile:
@@ -611,15 +633,27 @@ def init_profiles_api(
     org_crawl_dep = org_ops.org_crawl_dep

     async def browser_get_metadata(
-        browserid: str, org: Organization = Depends(org_crawl_dep)
-    ):
+        browserid: str, org: Organization
+    ) -> ProfileBrowserMetadata:
         # if await ops.redis.hget(f"br:{browserid}", "org") != str(org.id):
-        metadata = await crawl_manager.get_profile_browser_metadata(browserid)
-        if metadata.get("btrix.org") != str(org.id):
+        metadata = None
+        try:
+            metadata = await crawl_manager.get_profile_browser_metadata(browserid)
+        # pylint: disable=raise-missing-from
+        except Exception as e:
+            print(e)
+            raise HTTPException(status_code=400, detail="invalid_profile_browser")
+
+        if metadata.oid != str(org.id):
             raise HTTPException(status_code=404, detail="no_such_browser")

         return metadata

+    async def browser_metadata_dep(
+        browserid: str, org: Organization = Depends(org_crawl_dep)
+    ):
+        return await browser_get_metadata(browserid, org)
+
     async def browser_dep(browserid: str, org: Organization = Depends(org_crawl_dep)):
         await browser_get_metadata(browserid, org)
         return browserid
@@ -673,8 +707,6 @@ async def commit_browser_to_existing(
                 browserid=browser_commit.browserid,
                 name=browser_commit.name,
                 description=browser_commit.description or profile.description,
-                crawlerChannel=profile.crawlerChannel,
-                proxyId=profile.proxyId,
             ),
             org=org,
             user=user,
@@ -707,8 +739,11 @@ async def create_new(
         return await ops.create_new_browser(org, user, profile_launch)

     @router.post("/browser/{browserid}/ping", response_model=ProfilePingResponse)
-    async def ping_profile_browser(browserid: str = Depends(browser_dep)):
-        return await ops.ping_profile_browser(browserid)
+    async def ping_profile_browser(
+        metadata: ProfileBrowserMetadata = Depends(browser_metadata_dep),
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return await ops.ping_profile_browser(metadata, org)

     @router.post("/browser/{browserid}/navigate", response_model=SuccessResponse)
     async def navigate_profile_browser(
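Both ping_profile_browser and do_commit_to_profile now merge previously saved origins with whatever the browser reports rather than overwriting them: ping starts from the browser-reported list and appends the base profile's origins, while commit starts from the existing profile's origins and appends the reported ones. A pure-Python sketch of that de-duplicating append, with illustrative origins:

# Sketch of the order-preserving, de-duplicating merge used in both paths;
# the origin values are made up for illustration.
def merge_origins(existing: list[str], reported: list[str]) -> list[str]:
    """Append reported origins to existing ones, skipping duplicates."""
    merged = list(existing)
    for origin in reported:
        if origin not in merged:
            merged.append(origin)
    return merged


existing = ["https://example.com", "https://accounts.example.com"]
reported = ["https://example.com", "https://cdn.example.com"]

print(merge_origins(existing, reported))
# ['https://example.com', 'https://accounts.example.com', 'https://cdn.example.com']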
