From f10cdd51c8fa5bc088da46e81d86dae40a6e6930 Mon Sep 17 00:00:00 2001 From: CloakHQ Date: Tue, 31 Mar 2026 01:19:07 +0200 Subject: [PATCH 1/2] docs: add CloakBrowser to avoid-blocking guide Add CloakBrowser as a stealth browser option in the "Avoid getting blocked" guide, alongside Camoufox. Includes integration example using BrowserPool with a custom PlaywrightBrowserPlugin. --- docs/guides/avoid_blocking.mdx | 9 +++ .../playwright_with_cloakbrowser.py | 81 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 docs/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py diff --git a/docs/guides/avoid_blocking.mdx b/docs/guides/avoid_blocking.mdx index 423338dcfe..cf4c008783 100644 --- a/docs/guides/avoid_blocking.mdx +++ b/docs/guides/avoid_blocking.mdx @@ -10,6 +10,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py'; import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py'; +import PlaywrightWithCloakBrowser from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_cloakbrowser.py'; import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py'; @@ -41,6 +42,14 @@ In some cases even `PlaywrightCrawler` +## Using CloakBrowser + +For sites with aggressive anti-bot protection, [CloakBrowser](https://github.com/CloakHQ/CloakBrowser) takes a different approach. Instead of overriding fingerprints at the JavaScript level (which anti-bot scripts can detect as tampering), CloakBrowser ships a custom Chromium binary with fingerprints modified directly in the C++ source code. It is also Chromium-based, which can matter when a target site behaves differently with Firefox than with Chrome. 
Install it separately with `pip install cloakbrowser` — the plugin calls `ensure_binary()`, which automatically downloads and caches the Chromium binary on the first run. + + + {PlaywrightWithCloakBrowser} + + **Related links** - [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite) diff --git a/docs/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py b/docs/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py new file mode 100644 index 0000000000..508acae0bb --- /dev/null +++ b/docs/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py @@ -0,0 +1,81 @@ +import asyncio + +# CloakBrowser is an external package. Install it separately. +from cloakbrowser.config import IGNORE_DEFAULT_ARGS, get_default_stealth_args +from cloakbrowser.download import ensure_binary +from typing_extensions import override + +from crawlee.browsers import ( + BrowserPool, + PlaywrightBrowserController, + PlaywrightBrowserPlugin, +) +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +class CloakBrowserPlugin(PlaywrightBrowserPlugin): + """Example browser plugin that uses CloakBrowser's patched Chromium, + but otherwise keeps the functionality of PlaywrightBrowserPlugin. + """ + + @override + async def new_browser(self) -> PlaywrightBrowserController: + if not self._playwright: + raise RuntimeError('Playwright browser plugin is not initialized.') + + binary_path = ensure_binary() + stealth_args = get_default_stealth_args() + + # Merge CloakBrowser stealth args with any user-provided launch options. 
+ launch_options = dict(self._browser_launch_options) + launch_options.pop('executable_path', None) + launch_options.pop('chromium_sandbox', None) + existing_args = list(launch_options.pop('args', [])) + launch_options['args'] = [*existing_args, *stealth_args] + + return PlaywrightBrowserController( + browser=await self._playwright.chromium.launch( + executable_path=binary_path, + ignore_default_args=IGNORE_DEFAULT_ARGS, + **launch_options, + ), + max_open_pages_per_browser=1, + # CloakBrowser handles fingerprints at the binary level. + header_generator=None, + ) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Custom browser pool. Gives users full control over browsers used by the crawler. + browser_pool=BrowserPool(plugins=[CloakBrowserPlugin()]), + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract some data from the page using Playwright's API. + posts = await context.page.query_selector_all('.athing') + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + + # Extract the data we want from the elements. + title = await title_element.inner_text() if title_element else None + + # Push the extracted data to the default dataset. + await context.push_data({'title': title}) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. 
+ await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main()) From cb609b8c427e0d426dd4a12a0e91a3bd17a86792 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 31 Mar 2026 09:18:30 +0200 Subject: [PATCH 2/2] docs: add CloakBrowser to v1.6 versioned avoid-blocking guide Co-Authored-By: Claude Opus 4.6 (1M context) --- .../version-1.6/guides/avoid_blocking.mdx | 9 +++ .../playwright_with_cloakbrowser.py | 81 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py diff --git a/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx b/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx index 423338dcfe..cf4c008783 100644 --- a/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx +++ b/website/versioned_docs/version-1.6/guides/avoid_blocking.mdx @@ -10,6 +10,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py'; import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py'; +import PlaywrightWithCloakBrowser from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_cloakbrowser.py'; import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py'; @@ -41,6 +42,14 @@ In some cases even `PlaywrightCrawler` +## Using CloakBrowser + +For sites with aggressive anti-bot protection, [CloakBrowser](https://github.com/CloakHQ/CloakBrowser) takes a different approach. 
Instead of overriding fingerprints at the JavaScript level (which anti-bot scripts can detect as tampering), CloakBrowser ships a custom Chromium binary with fingerprints modified directly in the C++ source code. It is also Chromium-based, which can matter when a target site behaves differently with Firefox than with Chrome. Install it separately with `pip install cloakbrowser` — the plugin calls `ensure_binary()`, which automatically downloads and caches the Chromium binary on the first run. + + + {PlaywrightWithCloakBrowser} + + **Related links** - [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite) diff --git a/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py b/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py new file mode 100644 index 0000000000..508acae0bb --- /dev/null +++ b/website/versioned_docs/version-1.6/guides/code_examples/avoid_blocking/playwright_with_cloakbrowser.py @@ -0,0 +1,81 @@ +import asyncio + +# CloakBrowser is an external package. Install it separately. +from cloakbrowser.config import IGNORE_DEFAULT_ARGS, get_default_stealth_args +from cloakbrowser.download import ensure_binary +from typing_extensions import override + +from crawlee.browsers import ( + BrowserPool, + PlaywrightBrowserController, + PlaywrightBrowserPlugin, +) +from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext + + +class CloakBrowserPlugin(PlaywrightBrowserPlugin): + """Example browser plugin that uses CloakBrowser's patched Chromium, + but otherwise keeps the functionality of PlaywrightBrowserPlugin. + """ + + @override + async def new_browser(self) -> PlaywrightBrowserController: + if not self._playwright: + raise RuntimeError('Playwright browser plugin is not initialized.') + + binary_path = ensure_binary() + stealth_args = get_default_stealth_args() + + # Merge CloakBrowser stealth args with any user-provided launch options. 
+ launch_options = dict(self._browser_launch_options) + launch_options.pop('executable_path', None) + launch_options.pop('chromium_sandbox', None) + existing_args = list(launch_options.pop('args', [])) + launch_options['args'] = [*existing_args, *stealth_args] + + return PlaywrightBrowserController( + browser=await self._playwright.chromium.launch( + executable_path=binary_path, + ignore_default_args=IGNORE_DEFAULT_ARGS, + **launch_options, + ), + max_open_pages_per_browser=1, + # CloakBrowser handles fingerprints at the binary level. + header_generator=None, + ) + + +async def main() -> None: + crawler = PlaywrightCrawler( + # Limit the crawl to max requests. Remove or increase it for crawling all links. + max_requests_per_crawl=10, + # Custom browser pool. Gives users full control over browsers used by the crawler. + browser_pool=BrowserPool(plugins=[CloakBrowserPlugin()]), + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Extract some data from the page using Playwright's API. + posts = await context.page.query_selector_all('.athing') + for post in posts: + # Get the HTML elements for the title and rank within each post. + title_element = await post.query_selector('.title a') + + # Extract the data we want from the elements. + title = await title_element.inner_text() if title_element else None + + # Push the extracted data to the default dataset. + await context.push_data({'title': title}) + + # Find a link to the next page and enqueue it if it exists. + await context.enqueue_links(selector='.morelink') + + # Run the crawler with the initial list of URLs. + await crawler.run(['https://news.ycombinator.com/']) + + +if __name__ == '__main__': + asyncio.run(main())