Skip to content

Commit aa03965

Browse files
authored
Ensure that profile and crawlconfig sorting of names and URLs is case-insensitive (#3016)
Fixes #3012. Changes: (1) add profile indexes; (2) add case-insensitive collation to indexes for crawlconfig and profile fields that should be sorted regardless of case (e.g. name, tags, URLs); (3) update tests.
1 parent 0f30651 commit aa03965

File tree

12 files changed

+106
-27
lines changed

12 files changed

+106
-27
lines changed

backend/btrixcloud/colls.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import asyncio
1313
import pymongo
1414
import aiohttp
15-
from pymongo.collation import Collation
1615
from fastapi import Depends, HTTPException, Response
1716
from fastapi.responses import StreamingResponse
1817
from starlette.requests import Request
@@ -55,6 +54,7 @@
5554
slug_from_name,
5655
get_duplicate_key_error_field,
5756
get_origin,
57+
case_insensitive_collation,
5858
)
5959

6060
if TYPE_CHECKING:
@@ -104,7 +104,6 @@ def set_page_ops(self, ops):
104104

105105
async def init_index(self):
106106
"""init lookup index"""
107-
case_insensitive_collation = Collation(locale="en", strength=1)
108107
await self.collections.create_index(
109108
[("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)],
110109
unique=True,
@@ -536,7 +535,7 @@ async def list_collections(
536535
)
537536

538537
cursor = self.collections.aggregate(
539-
aggregate, collation=pymongo.collation.Collation(locale="en")
538+
aggregate, collation=case_insensitive_collation
540539
)
541540
results = await cursor.to_list(length=1)
542541
result = results[0]

backend/btrixcloud/crawlconfigs.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
validate_language_code,
7373
is_url,
7474
browser_windows_from_scale,
75+
case_insensitive_collation,
7576
)
7677

7778
if TYPE_CHECKING:
@@ -207,7 +208,18 @@ async def init_index(self):
207208
)
208209

209210
await self.crawl_configs.create_index(
210-
[("oid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
211+
[("oid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)],
212+
collation=case_insensitive_collation,
213+
)
214+
215+
await self.crawl_configs.create_index(
216+
[("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)],
217+
collation=case_insensitive_collation,
218+
)
219+
220+
await self.crawl_configs.create_index(
221+
[("oid", pymongo.ASCENDING), ("firstSeed", pymongo.ASCENDING)],
222+
collation=case_insensitive_collation,
211223
)
212224

213225
await self.crawl_configs.create_index(
@@ -846,7 +858,9 @@ async def get_crawl_configs(
846858
]
847859
)
848860

849-
cursor = self.crawl_configs.aggregate(aggregate)
861+
cursor = self.crawl_configs.aggregate(
862+
aggregate, collation=case_insensitive_collation
863+
)
850864
results = await cursor.to_list(length=1)
851865
result = results[0]
852866
items = result["items"]

backend/btrixcloud/db.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,13 @@
2929
from .background_jobs import BackgroundJobOps
3030
from .file_uploads import FileUploadOps
3131
from .crawlmanager import CrawlManager
32+
from .profiles import ProfileOps
3233
else:
3334
UserManager = OrgOps = CrawlConfigOps = CrawlOps = CollectionOps = InviteOps = (
3435
StorageOps
35-
) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = object
36+
) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = CrawlManager = (
37+
ProfileOps
38+
) = object
3639

3740

3841
CURR_DB_VERSION = "0054"
@@ -103,6 +106,7 @@ async def update_and_prepare_db(
103106
background_job_ops: BackgroundJobOps,
104107
file_ops: FileUploadOps,
105108
crawl_log_ops: CrawlLogOps,
109+
profile_ops: ProfileOps,
106110
crawl_manager: CrawlManager,
107111
) -> None:
108112
"""Prepare database for application.
@@ -139,6 +143,7 @@ async def update_and_prepare_db(
139143
storage_ops,
140144
file_ops,
141145
crawl_log_ops,
146+
profile_ops,
142147
)
143148
await user_manager.create_super_user()
144149
await org_ops.create_default_org()
@@ -262,6 +267,7 @@ async def create_indexes(
262267
storage_ops,
263268
file_ops,
264269
crawl_log_ops,
270+
profile_ops,
265271
):
266272
"""Create database indexes."""
267273
print("Creating database indexes", flush=True)
@@ -275,6 +281,7 @@ async def create_indexes(
275281
await storage_ops.init_index()
276282
await file_ops.init_index()
277283
await crawl_log_ops.init_index()
284+
await profile_ops.init_index()
278285

279286

280287
# ============================================================================

backend/btrixcloud/main_migrations.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ async def main() -> int:
2929
_,
3030
page_ops,
3131
coll_ops,
32-
_,
32+
profile_ops,
3333
storage_ops,
3434
background_job_ops,
3535
_,
@@ -55,6 +55,7 @@ async def main() -> int:
5555
background_job_ops,
5656
file_ops,
5757
crawl_log_ops,
58+
profile_ops,
5859
crawl_manager,
5960
)
6061

backend/btrixcloud/migrations/migration_0039_coll_slugs.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55
from uuid import UUID
66

77
from pymongo.errors import DuplicateKeyError
8-
from pymongo.collation import Collation
98
import pymongo
109

1110
from btrixcloud.migrations import BaseMigration
12-
from btrixcloud.utils import slug_from_name
11+
from btrixcloud.utils import slug_from_name, case_insensitive_collation
1312

1413
MIGRATION_VERSION = "0039"
1514

@@ -53,7 +52,6 @@ async def migrate_up(self):
5352
Add slug to collections that don't have one yet, based on name
5453
"""
5554
colls_mdb = self.mdb["collections"]
56-
case_insensitive_collation = Collation(locale="en", strength=1)
5755

5856
await colls_mdb.drop_indexes()
5957

backend/btrixcloud/orgs.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
from pydantic import ValidationError
1818
from pymongo import ReturnDocument
19-
from pymongo.collation import Collation
2019
from pymongo.errors import AutoReconnect, DuplicateKeyError
2120

2221
from fastapi import APIRouter, Depends, HTTPException, Request
@@ -93,6 +92,7 @@
9392
validate_language_code,
9493
JSONSerializer,
9594
browser_windows_from_scale,
95+
case_insensitive_collation,
9696
)
9797

9898
if TYPE_CHECKING:
@@ -189,7 +189,6 @@ def set_default_primary_storage(self, storage: StorageRef):
189189

190190
async def init_index(self) -> None:
191191
"""init lookup index"""
192-
case_insensitive_collation = Collation(locale="en", strength=1)
193192
while True:
194193
try:
195194
await self.orgs.create_index(

backend/btrixcloud/profiles.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from fastapi import APIRouter, Depends, Request, HTTPException, Query
2222
from starlette.requests import Headers
23-
from pymongo import ReturnDocument
23+
import pymongo
2424
import aiohttp
2525

2626
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -48,7 +48,7 @@
4848
ListFilterType,
4949
ProfileSearchValuesResponse,
5050
)
51-
from .utils import dt_now, str_to_date
51+
from .utils import dt_now, str_to_date, case_insensitive_collation
5252

5353
if TYPE_CHECKING:
5454
from .orgs import OrgOps
@@ -106,6 +106,26 @@ def set_crawlconfigs(self, crawlconfigs):
106106
"""set crawlconfigs ops"""
107107
self.crawlconfigs = crawlconfigs
108108

109+
async def init_index(self):
110+
"""init lookup index"""
111+
await self.profiles.create_index(
112+
[("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)],
113+
collation=case_insensitive_collation,
114+
)
115+
116+
await self.profiles.create_index(
117+
[("oid", pymongo.ASCENDING), ("url", pymongo.ASCENDING)],
118+
collation=case_insensitive_collation,
119+
)
120+
121+
await self.profiles.create_index(
122+
[("oid", pymongo.ASCENDING), ("created", pymongo.ASCENDING)]
123+
)
124+
125+
await self.profiles.create_index(
126+
[("oid", pymongo.ASCENDING), ("modified", pymongo.ASCENDING)]
127+
)
128+
109129
async def create_new_browser(
110130
self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
111131
) -> BrowserId:
@@ -415,7 +435,7 @@ async def update_profile_from_crawl_upload(
415435
"modifiedCrawlCid": cid,
416436
}
417437
},
418-
return_document=ReturnDocument.BEFORE,
438+
return_document=pymongo.ReturnDocument.BEFORE,
419439
)
420440
if not res:
421441
return False
@@ -494,7 +514,9 @@ async def list_profiles(
494514
]
495515
)
496516

497-
cursor = self.profiles.aggregate(aggregate)
517+
cursor = self.profiles.aggregate(
518+
aggregate, collation=case_insensitive_collation
519+
)
498520
results = await cursor.to_list(length=1)
499521
result = results[0]
500522
items = result["items"]

backend/btrixcloud/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from fastapi import HTTPException
1919
from fastapi.responses import StreamingResponse
2020
from iso639 import is_language
21+
from pymongo.collation import Collation
2122
from pymongo.errors import DuplicateKeyError
2223
from slugify import slugify
2324

@@ -26,6 +27,8 @@
2627

2728
browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
2829

30+
case_insensitive_collation = Collation(locale="en", strength=1)
31+
2932

3033
class JSONSerializer(json.JSONEncoder):
3134
"""Serializer class for json.dumps with UUID and datetime support"""

backend/test/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def _crawler_create_config_only(crawler_auth_headers, default_org_id):
207207
# Start crawl.
208208
crawl_data = {
209209
"runNow": False,
210-
"name": "Crawler User Test Crawl",
210+
"name": "crawler User Test Crawl",
211211
"description": "crawler test crawl",
212212
"config": {
213213
"seeds": [{"url": "https://old.webrecorder.net/"}],
@@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
232232
# Start crawl.
233233
crawl_data = {
234234
"runNow": True,
235-
"name": "Crawler User Test Crawl",
235+
"name": "crawler User Test Crawl",
236236
"description": "crawler test crawl",
237237
"tags": ["wr-test-2"],
238238
"config": {"seeds": [{"url": "https://old.webrecorder.net/"}], "limit": 3},
@@ -702,7 +702,7 @@ def echo_server():
702702
PROFILE_DESC_UPDATED = "Updated profile used for backend tests"
703703
PROFILE_TAGS_UPDATED = ["profile", "profile-updated", "old-webrecorder"]
704704

705-
PROFILE_2_NAME = "Second test profile"
705+
PROFILE_2_NAME = "second test profile"
706706
PROFILE_2_DESC = "Second profile used to test list endpoint"
707707
PROFILE_2_TAGS = ["profile", "specs-webrecorder"]
708708

backend/test/test_crawl_config_search_values.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def test_get_search_values_1(admin_auth_headers, default_org_id):
4444
)
4545
data = r.json()
4646
assert sorted(data["names"]) == sorted(
47-
[NAME_1, "Admin Test Crawl", "Crawler User Test Crawl"]
47+
[NAME_1, "Admin Test Crawl", "crawler User Test Crawl"]
4848
)
4949
assert sorted(data["descriptions"]) == sorted(
5050
["Admin Test Crawl description", "crawler test crawl", DESCRIPTION_1]
@@ -74,7 +74,7 @@ def test_get_search_values_2(admin_auth_headers, default_org_id):
7474
)
7575
data = r.json()
7676
assert sorted(data["names"]) == sorted(
77-
[NAME_1, NAME_2, "Admin Test Crawl", "Crawler User Test Crawl"]
77+
[NAME_1, NAME_2, "Admin Test Crawl", "crawler User Test Crawl"]
7878
)
7979
assert sorted(data["descriptions"]) == sorted(
8080
[
@@ -111,7 +111,7 @@ def test_get_search_values_3(admin_auth_headers, default_org_id):
111111
)
112112
data = r.json()
113113
assert sorted(data["names"]) == sorted(
114-
[NAME_1, NAME_2, "Admin Test Crawl", "Crawler User Test Crawl"]
114+
[NAME_1, NAME_2, "Admin Test Crawl", "crawler User Test Crawl"]
115115
)
116116
assert sorted(data["descriptions"]) == sorted(
117117
[

0 commit comments

Comments (0)