
Commit de517a1

fix: respect <base> when enqueuing (#1590)
### Description

- This PR focuses on ensuring that `extract_links` and `enqueue_links` respect the `<base>` tag on the page.

### Issues

- Closes: #1589

### Testing

- Update tests for enqueuing
1 parent b1d6287 commit de517a1
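For background: when a page declares `<base href="...">`, browsers resolve every relative link against that base rather than against the page's own URL. A minimal sketch with the standard library (URLs are illustrative) shows the difference this fix accounts for:

```python
from urllib.parse import urljoin

page_url = 'https://example.com/sub_index/'      # URL the page was loaded from
base_href = 'https://example.com/base_subpath/'  # declared via <base href="...">

# Resolving against the page URL (the old behavior):
print(urljoin(page_url, 'page_5'))   # https://example.com/sub_index/page_5

# Resolving against the <base> URL (what a browser does):
print(urljoin(base_href, 'page_5'))  # https://example.com/base_subpath/page_5
```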

File tree: 7 files changed, +62 −5 lines changed

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 8 additions & 2 deletions
```diff
@@ -167,9 +167,15 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')

         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
```
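Outside the crawler, the same first-match-with-fallback logic can be sketched with BeautifulSoup (a standalone illustration; Crawlee goes through its own parser abstraction, and the fallback URL here is hypothetical):

```python
from urllib.parse import urljoin

from bs4 import BeautifulSoup

html = '<html><head><base href="https://example.com/docs/"></head><body><a href="intro">Intro</a></body></html>'
soup = BeautifulSoup(html, 'html.parser')

# Take the first <base href> if present, otherwise fall back to the URL the page was fetched from.
base_tags = soup.select('base[href]')
base_url = base_tags[0]['href'] if base_tags else 'https://example.com/fallback'

print([urljoin(base_url, a['href']) for a in soup.select('a[href]')])
# ['https://example.com/docs/intro']
```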

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -369,9 +369,12 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
-        )
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
```
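The browser-based crawler does not need to parse the tag at all: `document.baseURI` already reflects the effective `<base>` and falls back to the document URL when none is present. A minimal standalone Playwright sketch (the markup is illustrative):

```python
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.set_content('<base href="https://example.com/base_subpath/"><a href="page_5">Link</a>')

        # document.baseURI honors the first <base href>; without one it equals the document URL.
        print(await page.evaluate('document.baseURI'))  # https://example.com/base_subpath/
        await browser.close()


asyncio.run(main())
```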

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -58,6 +58,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
         str(server_url / 'page_3'),
+        str(server_url / 'page_4'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }


@@ -131,6 +134,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
         str(server_url / 'sub_index'),
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
+        str(server_url / 'base_page'),
+        str(server_url / 'page_4'),
+        str(server_url / 'base_subpath/page_5'),
     }

     # # all urls added to `enqueue_links` must have a custom header
@@ -164,6 +170,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     assert visited == {
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }


@@ -221,6 +229,7 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
         str(server_url / 'page_3'),
+        str(server_url / 'page_4'),
     }
```

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -61,6 +61,9 @@ async def request_handler(context: ParselCrawlingContext) -> None:
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
         str(server_url / 'page_3'),
+        str(server_url / 'page_4'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }


@@ -151,6 +154,9 @@ async def request_handler(context: ParselCrawlingContext) -> None:
         str(server_url / 'sub_index'),
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
+        str(server_url / 'page_4'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }

     # # all urls added to `enqueue_links` must have a custom header
@@ -258,6 +264,8 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     assert visited == {
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }


@@ -315,6 +323,7 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
         str(server_url / 'page_3'),
+        str(server_url / 'page_4'),
     }
```

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -99,6 +99,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
         str(server_url / 'page_3'),
+        str(server_url / 'page_4'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }


@@ -668,6 +671,8 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
     assert visited == {
         str(server_url / 'start_enqueue'),
         str(server_url / 'sub_index'),
+        str(server_url / 'base_page'),
+        str(server_url / 'base_subpath/page_5'),
     }


@@ -724,6 +729,7 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:
         str(server_url / 'page_1'),
         str(server_url / 'page_2'),
         str(server_url / 'page_3'),
+        str(server_url / 'page_4'),
     }
```

tests/unit/server.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -15,6 +15,7 @@
 from yarl import URL

 from tests.unit.server_endpoints import (
+    BASE_INDEX,
     GENERIC_RESPONSE,
     HELLO_WORLD,
     INCAPSULA,
@@ -105,6 +106,7 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
         'page_1': generic_response_endpoint,
         'page_2': generic_response_endpoint,
         'page_3': generic_response_endpoint,
+        'base_page': base_index_endpoint,
         'problematic_links': problematic_links_endpoint,
         'set_cookies': set_cookies,
         'set_complex_cookies': set_complex_cookies,
@@ -431,6 +433,16 @@ async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, s
     )


+async def base_index_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
+    """Handle requests for the base index page."""
+    host = f'http://{get_headers_dict(_scope).get("host", "localhost")}'
+    content = BASE_INDEX.format(host=host).encode()
+    await send_html_response(
+        send,
+        content,
+    )
+
+
 class TestServer(Server):
     """A test HTTP server implementation based on Uvicorn Server."""
```
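The endpoint substitutes the incoming Host header into the fixture so the `<base>` URLs are absolute. A trivial illustration of that formatting step (the host value is hypothetical):

```python
template = '<base href="{host}/base_subpath/">'
print(template.format(host='http://localhost:8080'))
# <base href="http://localhost:8080/base_subpath/">
```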

tests/unit/server_endpoints.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -24,6 +24,18 @@
 <body>
 <a href="/page_3">Link 3</a>
 <a href="/page_2">Link 4</a>
+<a href="/base_page">Base Page</a>
+</body></html>"""
+
+BASE_INDEX = """\
+<html><head>
+<base href="{host}/base_subpath/">
+<base href="{host}/sub_index/">
+<title>Hello</title>
+</head>
+<body>
+<a href="page_5">Link 5</a>
+<a href="/page_4">Link 6</a>
 </body></html>"""

 INCAPSULA = b"""\
```
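Note that the fixture deliberately declares two `<base>` tags: per the HTML spec only the first one counts, which is what both extraction paths above reproduce (the first `base[href]` match, and `document.baseURI`). Resolving the fixture's two links against that first base (the host value is hypothetical) yields exactly the URLs the updated tests expect:

```python
from urllib.parse import urljoin

base = 'http://localhost:8080/base_subpath/'  # first <base href>; the second is ignored

print(urljoin(base, 'page_5'))   # http://localhost:8080/base_subpath/page_5
print(urljoin(base, '/page_4'))  # http://localhost:8080/page_4 (root-relative, so the base path is dropped)
```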
