From 32b97558a9fe70c401ba90182d51729eab8719ed Mon Sep 17 00:00:00 2001 From: Lorenz Braun Date: Sun, 31 May 2026 15:49:08 +0200 Subject: [PATCH] feat: provide Request instances in skipped request callbacks --- .../respect_robots_on_skipped_request.py | 6 ++- .../_abstract_http/_abstract_http_crawler.py | 39 ++++++++++++------ src/crawlee/crawlers/_basic/_basic_crawler.py | 8 ++-- .../_playwright/_playwright_crawler.py | 40 +++++++++++++------ .../test_beautifulsoup_crawler.py | 22 +++++----- .../crawlers/_parsel/test_parsel_crawler.py | 22 +++++----- .../_playwright/test_playwright_crawler.py | 22 +++++----- 7 files changed, 99 insertions(+), 60 deletions(-) diff --git a/docs/examples/code_examples/respect_robots_on_skipped_request.py b/docs/examples/code_examples/respect_robots_on_skipped_request.py index 5c7eca173f..8b9cf7b18e 100644 --- a/docs/examples/code_examples/respect_robots_on_skipped_request.py +++ b/docs/examples/code_examples/respect_robots_on_skipped_request.py @@ -1,6 +1,6 @@ import asyncio -from crawlee import SkippedReason +from crawlee import Request, SkippedReason from crawlee.crawlers import ( BeautifulSoupCrawler, BeautifulSoupCrawlingContext, @@ -18,7 +18,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # highlight-start # This handler is called when a request is skipped @crawler.on_skipped_request - async def skipped_request_handler(url: str, reason: SkippedReason) -> None: + async def skipped_request_handler(request: Request, reason: SkippedReason) -> None: + url = request.url + # Check if the request was skipped due to robots.txt rules if reason == 'robots_txt': crawler.log.info(f'Skipped {url} due to robots.txt rules.') diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 8d15a1d801..1f5e1be924 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ 
b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -206,6 +206,18 @@ async def extract_links( **kwargs: Unpack[EnqueueLinksKwargs], ) -> list[Request]: requests = list[Request]() + skipped = list[Request]() + + def create_request(request_options: RequestOptions) -> Request | None: + try: + return Request.from_url(**request_options) + except ValidationError as exc: + context.log.debug( + f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. ' + 'This may be caused by a malformed URL or unsupported URL scheme. ' + 'Please ensure the URL is correct and retry.' + ) + return None base_user_data = user_data or {} @@ -226,11 +238,19 @@ async def extract_links( else context.request.loaded_url or context.request.url ) links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) + skipped_iterator = iter([]) if robots_txt_file: - skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) - else: - skipped = iter([]) + skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) + + for url in skipped_iterator: + request_options = RequestOptions( + url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy + ) + request = create_request(request_options) + + if request is not None: + skipped.append(request) for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): request_options = RequestOptions( @@ -244,17 +264,10 @@ async def extract_links( if transform_request_options != 'unchanged': request_options = transform_request_options - try: - request = Request.from_url(**request_options) - except ValidationError as exc: - context.log.debug( - f'Skipping URL "{url}" due to invalid format: {exc}. ' - 'This may be caused by a malformed URL or unsupported URL scheme. ' - 'Please ensure the URL is correct and retry.' 
- ) - continue + request = create_request(request_options) - requests.append(request) + if request is not None: + requests.append(request) skipped_tasks = [ asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index be3da6dd27..7d0c6f842e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -110,7 +110,7 @@ ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]] FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] -SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]] +SkippedRequestCallback = Callable[[Request, SkippedReason], Awaitable[None]] class _BasicCrawlerOptions(TypedDict): @@ -1210,17 +1210,15 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e async def _handle_skipped_request( - self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False + self, request: Request, reason: SkippedReason, *, need_mark: bool = False ) -> None: if need_mark and isinstance(request, Request): request.state = RequestState.SKIPPED await self._mark_request_as_handled(request) - url = request.url if isinstance(request, Request) else request - if self._on_skipped_request: try: - await self._on_skipped_request(url, reason) + await self._on_skipped_request(request, reason) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 87a79eb13c..fde3b51376 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ 
b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -459,6 +459,18 @@ async def extract_links( The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function. """ requests = list[Request]() + skipped = list[Request]() + + def create_request(request_options: RequestOptions) -> Request | None: + try: + return Request.from_url(**request_options) + except ValidationError as exc: + context.log.debug( + f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. ' + 'This may be caused by a malformed URL or unsupported URL scheme. ' + 'Please ensure the URL is correct and retry.' + ) + return None base_user_data = user_data or {} @@ -478,10 +490,19 @@ async def extract_links( links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log) + skipped_iterator = iter([]) + if robots_txt_file: - skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) - else: - skipped = iter([]) + skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator) + + for url in skipped_iterator: + request_options = RequestOptions( + url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy + ) + request = create_request(request_options) + + if request is not None: + skipped.append(request) for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs): request_options = RequestOptions( @@ -495,17 +516,10 @@ async def extract_links( if transform_request_options != 'unchanged': request_options = transform_request_options - try: - request = Request.from_url(**request_options) - except ValidationError as exc: - context.log.debug( - f'Skipping URL "{url}" due to invalid format: {exc}. ' - 'This may be caused by a malformed URL or unsupported URL scheme. ' - 'Please ensure the URL is correct and retry.' 
- ) - continue + request = create_request(request_options) - requests.append(request) + if request is not None: + requests.append(request) skipped_tasks = [ asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 9a828b0078..a1a796078d 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -246,18 +246,22 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request - async def skipped_hook(url: str, _reason: SkippedReason) -> None: - skip(url) + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) await crawler.run([str(server_url / 'start_enqueue')]) - expected_skip_calls = [ - mock.call(str(server_url / 'page_1')), - mock.call(str(server_url / 'page_2')), - mock.call(str(server_url / 'page_3')), - mock.call(str(server_url / 'page_4')), - ] - skip.assert_has_calls(expected_skip_calls, any_order=True) + expected_skip_urls = { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + str(server_url / 'page_4'), + } + + requests = [call.args[0] for call in skip.call_args_list] + + assert all(isinstance(request, Request) for request in requests) + assert {request.url for request in requests} == expected_skip_urls async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 02f5b61a86..8fab2a7637 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -330,18 +330,22 @@ async def request_handler(context: ParselCrawlingContext) -> None: await
context.enqueue_links() @crawler.on_skipped_request - async def skipped_hook(url: str, _reason: SkippedReason) -> None: - skip(url) + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) await crawler.run([str(server_url / 'start_enqueue')]) - expected_skip_calls = [ - mock.call(str(server_url / 'page_1')), - mock.call(str(server_url / 'page_2')), - mock.call(str(server_url / 'page_3')), - mock.call(str(server_url / 'page_4')), - ] - skip.assert_has_calls(expected_skip_calls, any_order=True) + expected_skip_urls = { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + str(server_url / 'page_4'), + } + + requests = [call.args[0] for call in skip.call_args_list] + + assert all(isinstance(request, Request) for request in requests) + assert {request.url for request in requests} == expected_skip_urls async def test_extract_links(server_url: URL, http_client: HttpClient) -> None: diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 78d1789f99..92f6affeb2 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -765,18 +765,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: await context.enqueue_links() @crawler.on_skipped_request - async def skipped_hook(url: str, _reason: SkippedReason) -> None: - skip(url) + async def skipped_hook(request: Request, _reason: SkippedReason) -> None: + skip(request) await crawler.run([str(server_url / 'start_enqueue')]) - expected_skip_calls = [ - mock.call(str(server_url / 'page_1')), - mock.call(str(server_url / 'page_2')), - mock.call(str(server_url / 'page_3')), - mock.call(str(server_url / 'page_4')), - ] - skip.assert_has_calls(expected_skip_calls, any_order=True) + expected_skip_urls = { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url /
'page_3'), + str(server_url / 'page_4'), + } + + requests = [call.args[0] for call in skip.call_args_list] + + assert all(isinstance(request, Request) for request in requests) + assert {request.url for request in requests} == expected_skip_urls async def test_send_request(server_url: URL) -> None: