From 87a846bf83ebbe4130b95368fba99bba6ce3ca7b Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Sat, 28 Mar 2026 10:43:56 +0100
Subject: [PATCH] perf: offload BeautifulSoup parsing to a thread via asyncio.to_thread()

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../crawlers/_beautifulsoup/_beautifulsoup_parser.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
index 735444a576..78c2887cc2 100644
--- a/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
+++ b/src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from typing import TYPE_CHECKING, Literal
 
 from bs4 import BeautifulSoup, Tag
@@ -23,11 +24,12 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:
 
     @override
     async def parse(self, response: HttpResponse) -> BeautifulSoup:
-        return BeautifulSoup(await response.read(), features=self._parser)
+        body = await response.read()
+        return await asyncio.to_thread(BeautifulSoup, body, features=self._parser)
 
     @override
     async def parse_text(self, text: str) -> BeautifulSoup:
-        return BeautifulSoup(text, features=self._parser)
+        return await asyncio.to_thread(BeautifulSoup, text, features=self._parser)
 
     @override
     def is_matching_selector(self, parsed_content: Tag, selector: str) -> bool: