Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio

from crawlee import SkippedReason
from crawlee import Request, SkippedReason
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
Expand All @@ -18,7 +18,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# highlight-start
# This handler is called when a request is skipped
@crawler.on_skipped_request
async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
async def skipped_request_handler(request: Request, reason: SkippedReason) -> None:
url = request.url

# Check if the request was skipped due to robots.txt rules
if reason == 'robots_txt':
crawler.log.info(f'Skipped {url} due to robots.txt rules.')
Expand Down
39 changes: 26 additions & 13 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,18 @@ async def extract_links(
**kwargs: Unpack[EnqueueLinksKwargs],
) -> list[Request]:
requests = list[Request]()
skipped = list[Request]()

def create_request(request_options: RequestOptions) -> Request | None:
    """Build a `Request` from `request_options`, or return `None` if the URL is rejected.

    A `ValidationError` raised by `Request.from_url` is not propagated; it is
    reported through `context.log.debug` and signalled to the caller as `None`.
    """
    try:
        request = Request.from_url(**request_options)
    except ValidationError as exc:
        message = (
            f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
            'This may be caused by a malformed URL or unsupported URL scheme. '
            'Please ensure the URL is correct and retry.'
        )
        context.log.debug(message)
        return None

    return request

base_user_data = user_data or {}

Expand All @@ -226,11 +238,19 @@ async def extract_links(
else context.request.loaded_url or context.request.url
)
links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
skipped_iterator = iter([])

if robots_txt_file:
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
else:
skipped = iter([])
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)

for url in skipped_iterator:
request_options = RequestOptions(
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
)
request = create_request(request_options)

if request is not None:
skipped.append(request)

for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
request_options = RequestOptions(
Expand All @@ -244,17 +264,10 @@ async def extract_links(
if transform_request_options != 'unchanged':
request_options = transform_request_options

try:
request = Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue
request = create_request(request_options)

requests.append(request)
if request is not None:
requests.append(request)

skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
Expand Down
8 changes: 3 additions & 5 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@

ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
SkippedRequestCallback = Callable[[Request, SkippedReason], Awaitable[None]]


class _BasicCrawlerOptions(TypedDict):
Expand Down Expand Up @@ -1210,17 +1210,15 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e

async def _handle_skipped_request(
self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
self, request: Request, reason: SkippedReason, *, need_mark: bool = False
) -> None:
if need_mark and isinstance(request, Request):
request.state = RequestState.SKIPPED
await self._mark_request_as_handled(request)

url = request.url if isinstance(request, Request) else request

if self._on_skipped_request:
try:
await self._on_skipped_request(url, reason)
await self._on_skipped_request(request, reason)
except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e

Expand Down
40 changes: 27 additions & 13 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,18 @@ async def extract_links(
The `PlaywrightCrawler` implementation of the `ExtractLinksFunction` function.
"""
requests = list[Request]()
skipped = list[Request]()

def create_request(request_options: RequestOptions) -> Request | None:
    """Build a `Request` from `request_options`, or return `None` if the URL is rejected.

    A `ValidationError` raised by `Request.from_url` is not propagated; it is
    reported through `context.log.debug` and signalled to the caller as `None`.
    """
    try:
        request = Request.from_url(**request_options)
    except ValidationError as exc:
        message = (
            f'Skipping URL "{request_options["url"]}" due to invalid format: {exc}. '
            'This may be caused by a malformed URL or unsupported URL scheme. '
            'Please ensure the URL is correct and retry.'
        )
        context.log.debug(message)
        return None

    return request

base_user_data = user_data or {}

Expand All @@ -478,10 +490,19 @@ async def extract_links(

links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)

skipped_iterator = iter([])

if robots_txt_file:
skipped, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)
else:
skipped = iter([])
skipped_iterator, links_iterator = partition(robots_txt_file.is_allowed, links_iterator)

for url in skipped_iterator:
request_options = RequestOptions(
url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
)
request = create_request(request_options)

if request is not None:
skipped.append(request)

for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
request_options = RequestOptions(
Expand All @@ -495,17 +516,10 @@ async def extract_links(
if transform_request_options != 'unchanged':
request_options = transform_request_options

try:
request = Request.from_url(**request_options)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
'This may be caused by a malformed URL or unsupported URL scheme. '
'Please ensure the URL is correct and retry.'
)
continue
request = create_request(request_options)

requests.append(request)
if request is not None:
requests.append(request)

skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
Expand Down
22 changes: 13 additions & 9 deletions tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,18 +246,22 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
skip(request)

await crawler.run([str(server_url / 'start_enqueue')])

expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)
expected_skip_urls = {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}

# Collect the first positional argument of every recorded call to the `skip` mock.
requests = [call.args[0] for call in skip.call_args_list]

# A bare `all(...)` expression discards its result, so it must be asserted for
# the type check to actually run.
assert all(isinstance(request, Request) for request in requests)
assert {request.url for request in requests} == expected_skip_urls


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
Expand Down
22 changes: 13 additions & 9 deletions tests/unit/crawlers/_parsel/test_parsel_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,18 +330,22 @@ async def request_handler(context: ParselCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
skip(request)

await crawler.run([str(server_url / 'start_enqueue')])

expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)
expected_skip_urls = {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}

# Collect the first positional argument of every recorded call to the `skip` mock.
requests = [call.args[0] for call in skip.call_args_list]

# A bare `all(...)` expression discards its result, so it must be asserted for
# the type check to actually run.
assert all(isinstance(request, Request) for request in requests)
assert {request.url for request in requests} == expected_skip_urls


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
Expand Down
22 changes: 13 additions & 9 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,18 +765,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)
async def skipped_hook(request: Request, _reason: SkippedReason) -> None:
skip(request)

await crawler.run([str(server_url / 'start_enqueue')])

expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)
expected_skip_urls = {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}

# Collect the first positional argument of every recorded call to the `skip` mock.
requests = [call.args[0] for call in skip.call_args_list]

# A bare `all(...)` expression discards its result, so it must be asserted for
# the type check to actually run.
assert all(isinstance(request, Request) for request in requests)
assert {request.url for request in requests} == expected_skip_urls


async def test_send_request(server_url: URL) -> None:
Expand Down