-
Notifications
You must be signed in to change notification settings - Fork 704
fix: Defer page object cleanup to make it accessible in error handlers #1814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1413,6 +1413,8 @@ async def __run_task_function(self) -> None: | |
| proxy_info = await self._get_proxy_info(request, session) | ||
| result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request) | ||
|
|
||
| deferred_cleanup: list[Callable[[], Awaitable[None]]] = [] | ||
|
|
||
| context = BasicCrawlingContext( | ||
| request=result.request, | ||
| session=session, | ||
|
|
@@ -1423,6 +1425,7 @@ async def __run_task_function(self) -> None: | |
| get_key_value_store=result.get_key_value_store, | ||
| use_state=self._use_state, | ||
| log=self._logger, | ||
| register_deferred_cleanup=deferred_cleanup.append, | ||
| ) | ||
| self._context_result_map[context] = result | ||
|
|
||
|
|
@@ -1509,6 +1512,13 @@ async def __run_task_function(self) -> None: | |
| ) | ||
| raise | ||
|
|
||
| finally: | ||
| for cleanup in deferred_cleanup: | ||
| try: | ||
| await cleanup() | ||
| except Exception: # noqa: PERF203 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't we put the entire loop into the try-except block as the lint rule suggests?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That would mean that if one cleanup hook fails, all those that follow it will, too. I wouldn't worry about performance too much here, there won't be more than one for 99% of users. |
||
| self._logger.exception('Error in deferred cleanup') | ||
|
|
||
| async def _run_request_handler(self, context: BasicCrawlingContext) -> None: | ||
| context.request.state = RequestState.BEFORE_NAV | ||
| await self._context_pipeline( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import asyncio | ||
| import contextlib | ||
| import logging | ||
| import warnings | ||
| from datetime import timedelta | ||
|
|
@@ -236,6 +237,7 @@ async def _open_page( | |
| proxy_info=context.proxy_info, | ||
| get_key_value_store=context.get_key_value_store, | ||
| log=context.log, | ||
| register_deferred_cleanup=context.register_deferred_cleanup, | ||
| page=crawlee_page.page, | ||
| block_requests=partial(block_requests, page=crawlee_page.page), | ||
| goto_options=GotoOptions(**self._goto_options), | ||
|
|
@@ -296,63 +298,73 @@ async def _navigate( | |
| The enhanced crawling context with the Playwright-specific features (page, response, enqueue_links, | ||
| infinite_scroll and block_requests). | ||
| """ | ||
| async with context.page: | ||
| if context.session: | ||
| session_cookies = context.session.cookies.get_cookies_as_playwright_format() | ||
| await self._update_cookies(context.page, session_cookies) | ||
|
|
||
| if context.request.headers: | ||
| await context.page.set_extra_http_headers(context.request.headers.model_dump()) | ||
| # Navigate to the URL and get response. | ||
| if context.request.method != 'GET': | ||
| # Call the notification only once | ||
| warnings.warn( | ||
| 'Using other request methods than GET or adding payloads has a high impact on performance' | ||
| ' in recent versions of Playwright. Use only when necessary.', | ||
| category=UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
| # Enter the page context manager, but defer its cleanup (page.close()) so the page stays open | ||
| # during error handler execution. | ||
| await context.page.__aenter__() | ||
|
|
||
| route_handler = self._prepare_request_interceptor( | ||
| method=context.request.method, | ||
| headers=context.request.headers, | ||
| payload=context.request.payload, | ||
| ) | ||
| async def _close_page() -> None: | ||
| with contextlib.suppress(Exception): | ||
| await context.page.__aexit__(None, None, None) | ||
|
Comment on lines
+306
to
+307
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we really suppress all exceptions completely? Maybe we could at least log them? Just an idea — maybe there is a good reason to completely suppress all exceptions on this level. |
||
|
|
||
| # Set route_handler only for current request | ||
| await context.page.route(context.request.url, route_handler) | ||
| context.register_deferred_cleanup(_close_page) | ||
|
|
||
| try: | ||
| async with self._shared_navigation_timeouts[id(context)] as remaining_timeout: | ||
| response = await context.page.goto( | ||
| context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options | ||
| ) | ||
| context.request.state = RequestState.AFTER_NAV | ||
| except playwright.async_api.TimeoutError as exc: | ||
| raise asyncio.TimeoutError from exc | ||
|
|
||
| if response is None: | ||
| raise SessionError(f'Failed to load the URL: {context.request.url}') | ||
|
|
||
| # Set the loaded URL to the actual URL after redirection. | ||
| context.request.loaded_url = context.page.url | ||
|
|
||
| yield PlaywrightPostNavCrawlingContext( | ||
| request=context.request, | ||
| session=context.session, | ||
| add_requests=context.add_requests, | ||
| send_request=context.send_request, | ||
| push_data=context.push_data, | ||
| use_state=context.use_state, | ||
| proxy_info=context.proxy_info, | ||
| get_key_value_store=context.get_key_value_store, | ||
| log=context.log, | ||
| page=context.page, | ||
| block_requests=context.block_requests, | ||
| goto_options=context.goto_options, | ||
| response=response, | ||
| if context.session: | ||
| session_cookies = context.session.cookies.get_cookies_as_playwright_format() | ||
| await self._update_cookies(context.page, session_cookies) | ||
|
|
||
| if context.request.headers: | ||
| await context.page.set_extra_http_headers(context.request.headers.model_dump()) | ||
| # Navigate to the URL and get response. | ||
| if context.request.method != 'GET': | ||
| # Call the notification only once | ||
| warnings.warn( | ||
| 'Using other request methods than GET or adding payloads has a high impact on performance' | ||
| ' in recent versions of Playwright. Use only when necessary.', | ||
| category=UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
|
|
||
| route_handler = self._prepare_request_interceptor( | ||
| method=context.request.method, | ||
| headers=context.request.headers, | ||
| payload=context.request.payload, | ||
| ) | ||
|
|
||
| # Set route_handler only for current request | ||
| await context.page.route(context.request.url, route_handler) | ||
|
|
||
| try: | ||
| async with self._shared_navigation_timeouts[id(context)] as remaining_timeout: | ||
| response = await context.page.goto( | ||
| context.request.url, timeout=remaining_timeout.total_seconds() * 1000, **context.goto_options | ||
| ) | ||
| context.request.state = RequestState.AFTER_NAV | ||
| except playwright.async_api.TimeoutError as exc: | ||
| raise asyncio.TimeoutError from exc | ||
|
|
||
| if response is None: | ||
| raise SessionError(f'Failed to load the URL: {context.request.url}') | ||
|
|
||
| # Set the loaded URL to the actual URL after redirection. | ||
| context.request.loaded_url = context.page.url | ||
|
|
||
| yield PlaywrightPostNavCrawlingContext( | ||
| request=context.request, | ||
| session=context.session, | ||
| add_requests=context.add_requests, | ||
| send_request=context.send_request, | ||
| push_data=context.push_data, | ||
| use_state=context.use_state, | ||
| proxy_info=context.proxy_info, | ||
| get_key_value_store=context.get_key_value_store, | ||
| log=context.log, | ||
| register_deferred_cleanup=context.register_deferred_cleanup, | ||
| page=context.page, | ||
| block_requests=context.block_requests, | ||
| goto_options=context.goto_options, | ||
| response=response, | ||
| ) | ||
|
|
||
| def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext) -> ExtractLinksFunction: | ||
| """Create a callback function for extracting links from context. | ||
|
|
||
|
|
@@ -508,6 +520,7 @@ async def _create_crawling_context( | |
| proxy_info=context.proxy_info, | ||
| get_key_value_store=context.get_key_value_store, | ||
| log=context.log, | ||
| register_deferred_cleanup=context.register_deferred_cleanup, | ||
| page=context.page, | ||
| goto_options=context.goto_options, | ||
| response=context.response, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The type is huge, maybe we could utilize a type alias
DeferredCleanupCallback?