Skip to content

Commit f179f86

Browse files
authored
fix: Fix non-ASCII character corruption in FileSystemStorageClient on systems without UTF-8 default encoding (#1580)
### Description
- Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding

### Issues
- Closes: #1579

### Testing
- Add new tests for storage
1 parent 9d4ae64 commit f179f86

File tree

6 files changed

+62
-8
lines changed

6 files changed

+62
-8
lines changed

src/crawlee/storage_clients/_file_system/_dataset_client.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ async def open(
134134
continue
135135

136136
try:
137-
file = await asyncio.to_thread(path_to_metadata.open)
137+
file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
138138
try:
139139
file_content = json.load(file)
140140
metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ async def open(
163163

164164
# If the dataset directory exists, reconstruct the client from the metadata file.
165165
if path_to_dataset.exists() and path_to_metadata.exists():
166-
file = await asyncio.to_thread(open, path_to_metadata)
166+
file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
167167
try:
168168
file_content = json.load(file)
169169
finally:

src/crawlee/storage_clients/_file_system/_key_value_store_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ async def open(
133133
continue
134134

135135
try:
136-
file = await asyncio.to_thread(path_to_metadata.open)
136+
file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
137137
try:
138138
file_content = json.load(file)
139139
metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +162,7 @@ async def open(
162162

163163
# If the key-value store directory exists, reconstruct the client from the metadata file.
164164
if path_to_kvs.exists() and path_to_metadata.exists():
165-
file = await asyncio.to_thread(open, path_to_metadata)
165+
file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
166166
try:
167167
file_content = json.load(file)
168168
finally:
@@ -239,7 +239,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
239239
# Read the metadata file
240240
async with self._lock:
241241
try:
242-
file = await asyncio.to_thread(open, record_metadata_filepath)
242+
file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8')
243243
except FileNotFoundError:
244244
logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
245245
return None

src/crawlee/storage_clients/_file_system/_request_queue_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ async def open(
197197
continue
198198

199199
try:
200-
file = await asyncio.to_thread(path_to_metadata.open)
200+
file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
201201
try:
202202
file_content = json.load(file)
203203
metadata = RequestQueueMetadata(**file_content)
@@ -232,7 +232,7 @@ async def open(
232232

233233
# If the RQ directory exists, reconstruct the client from the metadata file.
234234
if path_to_rq.exists() and path_to_metadata.exists():
235-
file = await asyncio.to_thread(open, path_to_metadata)
235+
file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
236236
try:
237237
file_content = json.load(file)
238238
finally:
@@ -775,7 +775,7 @@ async def _parse_request_file(cls, file_path: Path) -> Request | None:
775775
"""
776776
# Open the request file.
777777
try:
778-
file = await asyncio.to_thread(open, file_path)
778+
file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
779779
except FileNotFoundError:
780780
logger.warning(f'Request file "{file_path}" not found.')
781781
return None

tests/unit/storages/test_dataset.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,3 +1082,20 @@ async def test_validate_name(storage_client: StorageClient, name: str, *, is_val
10821082
else:
10831083
with pytest.raises(ValueError, match=rf'Invalid storage name "{name}".*'):
10841084
await Dataset.open(name=name, storage_client=storage_client)
1085+
1086+
1087+
async def test_record_with_noascii_chars(dataset: Dataset) -> None:
1088+
"""Test handling record with non-ASCII characters."""
1089+
init_value = {
1090+
'record_1': 'Supermaxi El Jardín',
1091+
'record_2': 'záznam dva',
1092+
'record_3': '記録三',
1093+
}
1094+
1095+
# Save the record to the dataset
1096+
await dataset.push_data(init_value)
1097+
1098+
# Get the record and verify
1099+
value = await dataset.get_data()
1100+
assert value is not None
1101+
assert value.items[0] == init_value

tests/unit/storages/test_key_value_store.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,3 +1132,21 @@ async def test_get_auto_saved_value_various_global_clients(
11321132
await kvs.persist_autosaved_values()
11331133

11341134
assert await kvs.get_value(test_key) == autosaved_value_kvs
1135+
1136+
1137+
async def test_record_with_noascii_chars(kvs: KeyValueStore) -> None:
1138+
"""Test storing and retrieving a record with non-ASCII characters."""
1139+
init_value = {
1140+
'record_1': 'Supermaxi El Jardín',
1141+
'record_2': 'záznam dva',
1142+
'record_3': '記録三',
1143+
}
1144+
key = 'non_ascii_key'
1145+
1146+
# Save the record in the key-value store
1147+
await kvs.set_value(key, init_value)
1148+
1149+
# Get the record and verify
1150+
value = await kvs.get_value(key)
1151+
assert value is not None
1152+
assert value == init_value

tests/unit/storages/test_request_queue.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,3 +1348,22 @@ async def test_reclaim_request_with_change_state(rq: RequestQueue) -> None:
13481348
assert reclaimed_request is not None
13491349
assert reclaimed_request.url == 'https://example.com/original'
13501350
assert reclaimed_request.user_data['state'] == 'modified'
1351+
1352+
1353+
async def test_request_with_noascii_chars(rq: RequestQueue) -> None:
1354+
"""Test handling requests with non-ASCII characters in user data."""
1355+
data_with_special_chars = {
1356+
'record_1': 'Supermaxi El Jardín',
1357+
'record_2': 'záznam dva',
1358+
'record_3': '記録三',
1359+
}
1360+
init_request = Request.from_url('https://crawlee.dev', user_data=data_with_special_chars)
1361+
1362+
# Add a request with special user data
1363+
await rq.add_request(init_request)
1364+
1365+
# Get the request and verify
1366+
request = await rq.fetch_next_request()
1367+
assert request is not None
1368+
assert request.url == 'https://crawlee.dev'
1369+
assert request.user_data == init_request.user_data

0 commit comments

Comments
 (0)