Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/guides/avoid_blocking.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import PlaywrightDefaultFingerprintGenerator from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_fingerprint_generator.py';
import PlaywrightWithCamoufox from '!!raw-loader!roa-loader!../examples/code_examples/playwright_crawler_with_camoufox.py';
import PlaywrightWithCloakBrowser from '!!raw-loader!roa-loader!./code_examples/avoid_blocking/playwright_with_cloakbrowser.py';

import PlaywrightDefaultFingerprintGeneratorWithArgs from '!!raw-loader!./code_examples/avoid_blocking/default_fingerprint_generator_with_args.py';

Expand Down Expand Up @@ -41,6 +42,14 @@ In some cases even <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</Ap
{PlaywrightWithCamoufox}
</RunnableCodeBlock>

## Using CloakBrowser

For sites with aggressive anti-bot protection, [CloakBrowser](https://github.com/CloakHQ/CloakBrowser) takes a different approach. Instead of overriding fingerprints at the JavaScript level (which anti-bot scripts can detect as tampering), CloakBrowser ships a custom Chromium binary with fingerprints modified directly in the C++ source code. It is also Chromium-based, which can matter when a target site behaves differently with Firefox than with Chrome. Install it separately with `pip install cloakbrowser` — the plugin calls `ensure_binary()`, which automatically downloads and caches the Chromium binary on first run.

<RunnableCodeBlock className="language-python" language="python">
{PlaywrightWithCloakBrowser}
</RunnableCodeBlock>

**Related links**

- [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import asyncio

# CloakBrowser is an external package. Install it separately.
from cloakbrowser.config import IGNORE_DEFAULT_ARGS, get_default_stealth_args
from cloakbrowser.download import ensure_binary
from typing_extensions import override

from crawlee.browsers import (
BrowserPool,
PlaywrightBrowserController,
PlaywrightBrowserPlugin,
)
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


class CloakBrowserPlugin(PlaywrightBrowserPlugin):
    """Example browser plugin that uses CloakBrowser's patched Chromium,
    but otherwise keeps the functionality of PlaywrightBrowserPlugin.
    """

    @override
    async def new_browser(self) -> PlaywrightBrowserController:
        """Launch a CloakBrowser Chromium instance wrapped in a controller.

        Returns:
            A `PlaywrightBrowserController` managing the launched browser.

        Raises:
            RuntimeError: If the plugin has not been initialized yet.
        """
        if not self._playwright:
            raise RuntimeError('Playwright browser plugin is not initialized.')

        # ensure_binary() downloads and caches the Chromium binary on first
        # run — blocking filesystem/network I/O. Run it in a worker thread so
        # the event loop stays responsive while the download happens.
        binary_path = await asyncio.to_thread(ensure_binary)
        stealth_args = get_default_stealth_args()

        # Merge CloakBrowser stealth args with any user-provided launch options.
        launch_options = dict(self._browser_launch_options)
        # These two options are controlled by CloakBrowser itself; drop any
        # user-supplied overrides so they cannot conflict with the patched
        # binary we launch below.
        launch_options.pop('executable_path', None)
        launch_options.pop('chromium_sandbox', None)
        existing_args = list(launch_options.pop('args', []))
        launch_options['args'] = [*existing_args, *stealth_args]

        return PlaywrightBrowserController(
            browser=await self._playwright.chromium.launch(
                executable_path=binary_path,
                ignore_default_args=IGNORE_DEFAULT_ARGS,
                **launch_options,
            ),
            max_open_pages_per_browser=1,
            # CloakBrowser handles fingerprints at the binary level, so no
            # header generator is needed on top.
            header_generator=None,
        )


async def main() -> None:
    """Crawl Hacker News using a Playwright crawler backed by CloakBrowser."""
    crawler = PlaywrightCrawler(
        # Keep the example short; raise or remove this limit for a full crawl.
        max_requests_per_crawl=10,
        # A custom browser pool gives full control over the browsers used.
        browser_pool=BrowserPool(plugins=[CloakBrowserPlugin()]),
    )

    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Scrape post titles from one page and follow the pagination link."""
        context.log.info(f'Processing {context.request.url} ...')

        # Each Hacker News post row is an element with the .athing class.
        for item in await context.page.query_selector_all('.athing'):
            link = await item.query_selector('.title a')
            title = None if link is None else await link.inner_text()
            # Store the extracted title in the default dataset.
            await context.push_data({'title': title})

        # Queue the "More" link, if present, to continue to the next page.
        await context.enqueue_links(selector='.morelink')

    # Register the handler for every request (same effect as the decorator).
    crawler.router.default_handler(request_handler)

    # Start the crawl from the initial URL.
    await crawler.run(['https://news.ycombinator.com/'])


# Entry point: start the async crawler only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    asyncio.run(main())